---
title: "Preparation of PSID Data"
format:
  html:
    toc: true
---

# Introduction

This notebook contains code for transforming raw PSID data into the final datasets `MotherPanelCDS.csv` and `ChildPanelCDS.csv`. 

## Dependencies

The block of code below loads all the packages used in the process of doing this work.

```{r}
library(tidyverse)
library(foreign)

```

# Arranging Main Interview PSID Data

## Identifiers Panel

The main interview is administered at the household level, given by a survey year and an interview number (``intnum``). Here we make a panel linking each individual to an interview number in each survey year.

```{r}
D <- readxl::read_excel("../../../data/data_PSID/main/identifiers/CrossYearIndex.xlsx")
D <- D[-1]

index <- read.csv("../../../data/data_PSID/main/identifiers/CrossYearIndex_labels.csv")
names(index) <- c("variable","label","year")
index$label <- str_c(index$label,'_',index$year) 
index <- index[-3]

#everything should be renamed #<- names
for (i in 1:length(D)){
  names(D)[i] = as.character(index[["label"]][i])
}

D <- D %>% pivot_longer(
  cols=!c(in_1968,pn_1968),
  names_to = c("variable","year"),
  names_sep = "_",
  values_to="value")

D <- D %>%
  pivot_wider(names_from=variable,values_from=value) %>%
  mutate(year = as.integer(year))

names(D) <- c("intnum68","pernum","year","relhead",
               "mpair","intnum","sn")

D <- D[,c(1,2,3,6,7,4,5)] %>%
  filter(!is.na(intnum),intnum>0) #<- drop observations where an individual cannot be merged to an interview. We don't need those.

identifiers_panel <- D

```

## Main Interview: Childcare Expenditures

Load the childcare data and arrange it into a panel at the `intnum` level.

```{r}
D <- readxl::read_excel("../../../data/data_PSID/main/expenditures/childcare/Main_childcare.xlsx")

# these are all the years that the question is available
years <- c(seq(1988,1997),seq(1999,2017,2))

d <- data.frame(year = integer(), intnum = integer(), childcare_exp = double())

for (i in 1:length(years)) {
  d0 <- data.frame(year = years[i], intnum = D[[(i-1)*3+2]], childcare_exp = D[[(i-1)*3+3]])
  d <- rbind(d,na.omit(d0))
}

# code up missing values
d$childcare_exp[(d$childcare_exp>=99998) & d$year<1999] <- NA
d$childcare_exp[(d$childcare_exp>=999998)] <- NA

main_childcare <- d

```


## Main Interview: State of Residence

Load raw state of residence data and transform it into an `intnum` panel. Also loads a cross-walk from psid stat codes to FIPS and SOI.

```{r}
H <- read.csv("../../../data/data_PSID/main/state/State.csv")

index2 <- read.csv("../../../data/data_PSID/main/state/state-labels.csv",header=FALSE)
names(index2) <- c("variable","label","year")
index2$label <- str_c(index2$label,'_',index2$year) 
index2 <- index2[-3]

for (i in 1:length(H)){
  names(H)[i] = as.character(index2[["label"]][i])
}

H <- H %>% select(-contains("relnum")) #dropping release num

H <- H %>%
  pivot_longer(cols=everything(),names_to = c(".value","year"),names_sep = "_") %>%
  mutate(year = as.integer(year)) %>%
  filter(!is.na(intnum)) %>%
  filter(state!=0,state!=99) #<- drop missing observations

H <- H[,c(2,3,1)]

main_state <- H

state_codes <- read.csv("../../../data/data_PSID/StateCodes.csv") %>%
  mutate(X = NULL, X.1 = NULL) %>% #<- this is a simple crosswalk of state codes
  select(-state) %>%
  rename(state = PSID, StFIPS = fips) %>% #<- we rename some things in order to do the merge properly
  select(state, StFIPS, SOI, state_str)
```

## Main Interview: Parent and Relative Location

Create a variable indicating if the individual still lives in the state they were born in.

```{r}
# ---- Measure 1: using an indicator for whether the individual is living in the state they grew up in
years <- c(1968,seq(1969,1996),seq(1997,2019,2))

d1 <- readxl::read_excel("../../../data/data_PSID/main/parent-location/head_mobility.xlsx")

# create a horizontal file:
head_grow_up = data.frame() 
for (i in seq(1:length(years))) {
  d = data.frame()
  #d$year <- years[i]
  v_intnum <- names(d1)[(i-1)*3+2]
  d <- d1[,v_intnum] %>%
    rename(intnum = v_intnum)
  d$year <- years[i]
  #d$intnum <- d1[,v_intnum]
  v_loc <- names(d1)[i*3]
  d[,"state_grow_up"]<- d1[,v_loc]
  d <- d %>%
    drop_na()
  head_grow_up <- rbind(head_grow_up,d)
}

head_grow_up <-  head_grow_up %>%
  mutate(state_grow_up=na_if(state_grow_up,9))

```

Create a variable indicating the number of miles to parents.

```{r}

# ---- Measure 2: distance to parents in the 1988 main interview using a measure of distance from 1988:

# load and clean the data
# here we create a measure of whether either parent is within 10 miles or within 100 miles
parent_distance <- readxl::read_excel("../../../data/data_PSID/main/parent-location/parent_distance.xlsx") %>%
  rename(                      
         intnum = V14802,#     1988 INTERVIEW NUMBER                   
         head_miles_father = V15816,     # OF MILES TO FATHER                    
         head_miles_mother = V15830,     # OF MILES TO MOTHER                    
         sp_miles_father = V15873,     # OF MILES TO FATHER-W                  
         sp_miles_mother = V15887,     # OF MILES TO MOTHER-W                  
  ) %>%
  mutate(head_ten_miles = head_miles_father<=2 | head_miles_mother<=2,
         head_hundred_miles = head_miles_father<=3 | head_miles_mother<=3,
         sp_ten_miles = sp_miles_father<=2 | sp_miles_mother<=2,
         sp_hundred_miles = sp_miles_father<=3 | sp_miles_mother<=3) %>%
  drop_na()


```

Finally, an indicator for other relatives in the household.

```{r}
relative_present <- read.csv("../../../data/data_PSID/main/family_matrix/family_matrix.csv") %>%
  mutate(KID=MX5*1000 +   MX6)%>%
  #indicator: there is an older relative (grandparent or uncle/aunt) in the HH
  mutate(biol_grand=case_when(MX8==60 ~ 1,
                              #  MX8==94 ~ NA_real_,
                              TRUE ~ 0))%>%
  mutate(older_relative=case_when(MX8>=60 & MX8<=65 ~ 1,
                                  MX8>=71 & MX8<=71 ~ 1,
                                  MX8>=80 & MX8<=81 ~ 1,
                                  MX8==84 ~ 1,
                                  # MX8==94 ~ NA_real_,
                                  TRUE ~ 0))%>%
  group_by(KID, year) %>%
  mutate(older_relative_hh = max(older_relative, na.rm = TRUE), 
         biol_grand_hh = max(biol_grand, na.rm = TRUE) )%>%
  select(KID,year,older_relative_hh,  biol_grand_hh)%>%
  distinct(KID, year, .keep_all = TRUE)

```

## Main Interview: Race

```{r}

D <- readxl::read_excel("../../../data/data_PSID/main/race/Race.xlsx") %>% 
  rename(intnum = V3)


# let's create the data just by stacking
# two blocks: years with just head, and years with
years1 <- seq(1969,1984)
years2 <- c(seq(1985,1997),seq(1999,2017,2))

# here are the variable names we want to use, copied and pasted
# NOTE: we switched the order of head and spouse in 1985 (first appearance) in order to do things properly
vnames <- c("V441"   ,"V442"   ,"V801"   ,"V1101"  ,"V1102"  ,"V1490"  ,"V1801"  ,"V1802"  ,"V2202"  ,"V2401"  ,"V2402"  ,"V2828"  ,"V3001"  ,"V3002"  ,"V3300"  ,"V3401"  ,"V3402"  ,"V3720"  ,"V3801"  ,"V3802"  ,"V4204"  ,"V4301"  ,"V4302"  ,"V5096"  ,"V5201"  ,"V5202"  ,"V5662"  ,"V5701"  ,"V5702"  ,"V6209"  ,"V6301"  ,"V6302"  ,"V6802"  ,"V6901"  ,"V6902"  ,"V7447"  ,"V7501"  ,"V7502"  ,"V8099"  ,"V8201"  ,"V8202"  ,"V8723"  ,"V8801"  ,"V8802"  ,"V9408"  ,"V10001" ,"V10002" ,"V11055" ,"V11101" ,"V11102" ,"V12293" ,"V11938" ,"V12501" ,"V12502" ,"V13500" ,"V13565" ,"V13701" ,"V13702" ,"V14547" ,"V14612" ,"V14801" ,"V14802" ,"V16021" ,"V16086" ,"V16301" ,"V16302" ,"V17418" ,"V17483" ,"V17701" ,"V17702" ,"V18749" ,"V18814" ,"V19001" ,"V19002" ,"V20049" ,"V20114" ,"V20301" ,"V20302" ,"V21355" ,"V21420" ,"V21601" ,"V21602" ,"V23212" ,"V23276" ,"ER2001" ,"ER2002" ,"ER3883" ,"ER3944" ,"ER5001" ,"ER5002" ,"ER6753" ,"ER6814" ,"ER7001" ,"ER7002" ,"ER8999" ,"ER9060" ,"ER10001","ER10002","ER11760","ER11848","ER13001","ER13002","ER15836","ER15928","ER17001","ER17002","ER19897","ER19989","ER21001","ER21002","ER23334","ER23426","ER25001","ER25002","ER27297","ER27393","ER36001","ER36002","ER40472","ER40565","ER42001","ER42002","ER46449","ER46543","ER47301","ER47302","ER51810","ER51904","ER53001","ER53002","ER57549","ER57659","ER60001","ER60002","ER64671","ER64810","ER66001","ER66002","ER70744","ER70882")

V <- data.frame()

for (i in seq(1,length(years1))) {
  v1 = vnames[(i-1)*3+2]
  v2 = vnames[i*3]
  d <- select(D,c(v1,v2))
  d <- filter(d,!is.na(d[,v1]))
  names(d) <- c("intnum","race_head")
  d$year <- years1[i]
  d$race_wife <- NA
  V <- rbind(V,d)
  
  
}

for (i in seq(1,length(years2))) {
  j = i + length(years1)
  v1 = vnames[48+(i-1)*4+2]
  v2 = vnames[48+(i-1)*4+3]
  v3 = vnames[48 + i*4]
  d <- select(D,c(v1,v3,v2))
  d <- filter(d,!is.na(d[,v1]))
  names(d) <- c("intnum","race_head","race_wife")
  d$year <- years2[i]
  V <- rbind(V,d)
  
  
}

Ind = identifiers_panel %>% 
  mutate(ID = intnum68*1000 + pernum) %>%
  filter(year<=2011) #<- use only data from 2011 and earlier

V <- merge(V,Ind)
V$Race <- NA
V[V$sn==1,"Race"] <- V[V$sn==1,"race_head"]
V[V$sn==2,"Race"] <- V[V$sn==2,"race_wife"]

D <- V %>%
  filter(!is.na(Race) & Race!=0 & Race!=9) %>%
  group_by(ID) %>%
  arrange(year) %>%
  filter(row_number()==n()) %>% #<- keep the latest record
  select(ID,Race)

race <- D
  
```

## Main Interview: Education

```{r}
D <- readxl::read_excel("../../../data/data_PSID/main/education/grades_completed.xlsx") %>%
  mutate(ID = ER30001*1000 + ER30002)
D1 <- D[,seq(5,158,4)]

# this routine pulls out the most recent education variable, up to the year 2011.
# the data itself go until 2017, but we only go up to 2011 to replicate the data used in first draft of CLMP
D$educ <- -1
D$year_meas <- -1
years <- c(1968,seq(1970,1996),seq(1997,2017,2))

# replace with (i in 1:length(years)) if we want to update using measures from 2013-2017
for (i in 1:36) {
  print(i)
  I_use <- (D1[,i]>0) & (D1[,i]<98)
  D[I_use,"educ"] <- D1[I_use,i]
  D[I_use,"year_meas"] <- years[i]
}

# do some final data cleaning and save the file
education <- D %>%
  mutate(educ = na_if(educ,-1), 
         #ed=case_when(educ>16 ~ 5,educ == 16 ~ 4, educ>=13 ~ 3, educ>=12 ~ 2, educ<12 ~ 1)
         ed=case_when(educ>16 ~ ">16",educ == 16 ~ "16", educ>=13 ~ "13-15", educ>=12 ~ "12", educ<12 ~ "<12")
         ) %>%
  select(ID,educ,ed,year_meas) %>%
  #mutate(ed = factor(ed,levels = c(1,2,3,4,5),labels=c("<12","12","13-15","16",">16"))) %>%
  select(ID,ed) #<- just use this measure for now
```

## Main Interview: Labor Market Data

```{r}
G <- readxl::read_excel("../../../data/data_PSID/main/labor-market/earnings-hours.xlsx")

index1 <- read.csv("../../../data/data_PSID/main/labor-market/earnings_labels_clean.csv",header=FALSE)
names(index1) <- c("variable","label","year")
index1$label <- str_c(index1$label,'_',index1$year) 
index1 <- index1[-3]

#everything should be renamed
for (i in 1:length(G)){
  names(G)[i] = as.character(index1[["label"]][i])
}

G <- G %>% select(-contains("relnum")) %>% #dropping the relnum columns
  pivot_longer(cols=everything(),names_to = c(".value","year"),names_sep = "_") %>%
  filter(!is.na(intnum)) %>%
  mutate(year = as.integer(year))

G[c("earnspousebusiness")][is.na(G[c("earnspousebusiness")])] <- 0
#replaces NA business earnings with 0

names(G) <- c("year","intnum","hours_head","hours_spouse",
              "earn_head","earn_spouse","earn_spouseBusiness")

G <- G[,c(2,1,5,6,3,4,7)]

##crude way to remove the variables we don't want, I was checking sequentially
G <- G[!(G$year==1993 & G$hours_head==9999),] 
G <- G[!(G$year==1993 & G$hours_spouse==6730),]
G <- G[!(G$year==1993 & G$hours_spouse==9999),] 
G <- G[!(G$year==1993 & G$earn_head==99999999),] 
G <- G[!(G$year==1994 & G$hours_head==9999),]
G <- G[!(G$year==1994 & G$hours_spouse==9999),]
G <- G[!(G$year==1994 & G$earn_head==9999999),]
G <- G[!(G$year==1994 & G$earn_spouseBusiness==999999),]
G <- G[!(G$year==1994 & G$earn_spouse==9999999),]
G <- G[!(G$year==1995 & G$hours_head==7800),]
G <- G[!(G$year==1995 & G$earn_head==9999999),]
G <- G[!(G$year==1995 & G$earn_spouse==9999999),]
G <- G[!(G$year==1996 & G$hours_spouse==999999),]

earnings_panel <- G
```

Now we clean data from the T2 supplement:


Since the PSID went to a biennial format after 1997, the file *LaborFile.csv* does not have data for 1998,2000, and so on. A supplementary set of questions is asked about the family members from the previous year. These are sometimes called the "T2" supplement, which the chunk of code below cleans into a panel dataset for those missing years. These data were collected from the "Individual Data Index" from the PSID's data center tool, so they are linked to individuals already. The code below renames the variables and reshapes them into a panel. The reporting of earnings in these years is slightly different from the main questions, so we keep only observations that are reported in annual terms (the large majority). There is also an abnormally large increase in the number of individuals who report zero hours but positive earnings from 2001 and on. We have to correct for this using the T2 variables from the main interview file.

```{r}

#changes: add T2 income variables from the individual fils : available up to 2007, no calculated hourly wage after 2001
#changes: add T2 income variables from the family: merge using ID in 2009
#ifif hrs and earnings positive - use earnings : why so many zero wages?


Dt2 <- readxl::read_excel("../../../data/data_PSID/main/labor-market/T2_97_07.xlsx") %>% 
  mutate(MID = ER30001*1000 + ER30002,FID = MID) %>%
  rename(earn97 = ER33536A,earn99 = ER33627A,earn01 = ER33727A) %>%
  rename(earn97u = ER33536B,earn99u = ER33627B,earn01u = ER33727B) %>%
  rename(wage97 = ER33537O,wage99 = ER33628O,wage01 = ER33728O) %>%
  rename(ww97 = ER33536C,ww99 = ER33627C,ww01 = ER33727C) %>%
  rename(hw97 = ER33536Q,hw99 = ER33627Q,hw01 = ER33727Q)%>%
  rename(earn03 = ER33826A,earn05 = ER33926A) %>%
  rename(earn03u = ER33826B ,earn05u = ER33926B) %>%
  rename(hw03 = ER33827U, hw05 = ER33927C ) %>%
  rename(ww03 = ER33827S , ww05 = ER33927A  )



  
# let's just do these for now
D97 <- Dt2 %>% select(MID,FID,earn97,earn97u,wage97,ww97,hw97) %>%
  rename(earn=earn97,earnu=earn97u,wage=wage97,ww=ww97,hw=hw97) %>%
  mutate(year=1997)

D99 <- Dt2 %>% select(MID,FID,earn99,earn99u,wage99,ww99,hw99) %>%
  rename(earn=earn99,earnu=earn99u,wage=wage99,ww=ww99,hw=hw99) %>%
  mutate(year=1999)

D01 <- Dt2 %>% select(MID,FID,earn01,earn01u,wage01,ww01,hw01) %>%
  rename(earn=earn01,earnu=earn01u,wage=wage01,ww=ww01,hw=hw01) %>%
  mutate(year=2001)

D03 <- Dt2 %>% select(MID,FID,earn03,earn03u,ww03,hw03) %>%
  rename(earn=earn03,earnu=earn03u,ww=ww03,hw=hw03) %>%
  mutate(year=2003)

D05 <- Dt2 %>% select(MID,FID,earn05,earn05u,ww05,hw05) %>%
  rename(earn=earn05,earnu=earn05u,ww=ww05,hw=hw05) %>%
  mutate(year=2005)

# --- Now, read in the T-2 in 2001 variables from the family file
Dt01 <- read.csv("../../../data/data_PSID/main/labor-market/T2_supp.csv") %>% 
  rename(intnum = ER21002,h_earn = ER23702F1,h_ww = ER23702D3,h_hw = ER23702E8) %>%
  rename(w_earn = ER23702L4, w_ww = ER23702J6, w_hw = ER23702L2) %>%
  mutate(year = 2003)


#Doownload variables from family files separately from individual files - smaller datasets
# --- Read in the T-2 from family file in 2003
Dt03 =  readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>% #2005 family interview number , 2003 earnings
  rename(intnum = ER25002,h_earn = ER27711F1,h_ww = ER27711D3,h_hw = ER27711E8, h_earnu=ER27711F2) %>%
  rename(w_earn = ER27711L4, w_ww = ER27711J6  , w_hw = ER27711L2, w_earnu= ER27711L5) %>%
  mutate(year = 2005) %>%
  select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>%
  filter(is.na(intnum)==0)


Dt05 =  readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>% #2007 family interview number, 2005 earnings
  rename(intnum = ER36002,h_earn = ER40686F1,h_ww = ER40686D3,h_hw = ER40686E8 , h_earnu=ER40686F2) %>%
  rename(w_earn = ER40686L4, w_ww =  ER40686J6 , w_hw = ER40686L2, w_earnu= ER40686L5 ) %>%
  mutate(year = 2007) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)


# --- Read in the T-2 in 2007 for household heads and wives #2009 family interview number, 2007 earnings
Dt07 =  readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER42002 ,h_earn = ER46673,h_ww = ER46670 ,h_hw = ER46671 , h_earnu=ER46674 ) %>%
  rename(w_earn = ER46684, w_ww = ER46681   , w_hw = ER46682 , w_earnu= ER46685 ) %>%
  mutate(year = 2009) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu) %>% 
  filter(is.na(intnum)==0)


# --- Read in the T-2 in 2009 for household heads and wives #2011 family interview number, 2009 earnings
Dt09 =  readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER47302 ,h_earn = ER52074 ,h_ww = ER52071 ,h_hw = ER52072 , h_earnu= ER52075 ) %>%
  rename(w_earn = ER52085, w_ww = ER52082   , w_hw = ER52083 , w_earnu= ER52086 ) %>%
  mutate(year = 2011) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)


# --- Read in the T-2 in 2011 for household heads and wives #2013 family interview number, 2011 earnings
Dt11 =   readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER53002 ,h_earn = ER57841 ,h_ww = ER57825 ,h_hw = ER57839 , h_earnu= ER57842 ) %>%
  rename(w_earn = ER57889, w_ww = ER57873  , w_hw = ER57887  , w_earnu= ER57890 ) %>%
  mutate(year = 2013) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)




# --- Read in the T-2 in 2013 for household heads and wives #2015 family interview number, 2013 earnings
Dt13 =   readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER60002 ,h_earn = ER65021 ,h_ww = ER65005,h_hw = ER65019 , h_earnu= ER65022) %>%
  rename(w_earn = ER65069 , w_ww = ER65053  , w_hw = ER65067 , w_earnu= ER65070 ) %>%
  mutate(year = 2015) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)

# --- Read in the T-2 in 2015 for household heads and wives #2017 family interview number, 2015 earnings
Dt15 =   readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER66002,h_earn = ER71113 ,h_ww = ER71097,h_hw = ER71111, h_earnu= ER71114) %>%
  rename(w_earn = ER71161, w_ww = ER71145  , w_hw = ER71159 , w_earnu= ER71162) %>%
  mutate(year = 2017) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)

# --- Read in the T-2 in 2017 for household heads and wives #2019 family interview number, 2017 earnings
Dt17 =   readxl::read_excel("../../../data/data_PSID/main/labor-market/T2.xlsx")%>%
  rename(intnum = ER72002,h_earn = ER77135 ,h_ww = ER77119,h_hw = ER77133, h_earnu= ER77136) %>%
  rename(w_earn = ER77183 , w_ww = ER77167  , w_hw = ER77181 , w_earnu= ER77184) %>%
  mutate(year = 2019) %>% select(year, intnum, h_earn,h_ww,h_hw,h_earnu,w_earn,w_ww,w_hw,w_earnu)%>% 
  filter(is.na(intnum)==0)


D01 <- select(D01,c("MID","FID","year","earn","earnu","wage","hw","ww"))
D03 <- select(D03,c("MID","FID","year","earn","earnu","hw","ww"))
D05 <- select(D05,c("MID","FID","year","earn","earnu","hw","ww"))

d01 <- identifiers_panel %>% 
  filter(year==2003) %>% 
  mutate(MID = intnum68*1000 + pernum) %>%
  merge(Dt01) %>%
  mutate(year = 2001)

d03 <- identifiers_panel %>% 
  filter(year==2005) %>% 
  mutate(MID = intnum68*1000 + pernum) %>%
  merge(Dt03) %>%
  mutate(year = 2003)

d05 <- identifiers_panel %>% 
  filter(year==2007) %>% 
  mutate(MID = intnum68*1000 + pernum) %>%
  merge(Dt05) %>%
  mutate(year = 2005)



D01 <- merge(D01,d01)
Ih = D01$sn==1
D01[Ih,c("earn","hw","ww")] = D01[Ih,c("h_earn","h_hw","h_ww")]
Iw = D01$sn==2
D01[Iw,c("earn","hw","ww")] = D01[Iw,c("w_earn","w_hw","w_ww")]
D01 <- select(D01,c("MID","FID","year","earn","earnu","wage","hw","ww"))

D03 <- merge(D03,d03)
Ih = D03$sn==1
D03[Ih,c("earn","hw","ww")] = D03[Ih,c("h_earn","h_hw","h_ww")]
Iw = D03$sn==2
D03[Iw,c("earn","hw","ww")] = D03[Iw,c("w_earn","w_hw","w_ww")]
D03 <- D03%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D03 <- select(D03,c("MID","FID","year","earn","earnu","wage","hw","ww"))

D05 <- merge(D05,d05)
Ih = D05$sn==1
D05[Ih,c("earn","hw","ww")] = D05[Ih,c("h_earn","h_hw","h_ww")]
Iw = D05$sn==2
D05[Iw,c("earn","hw","ww")] = D05[Iw,c("w_earn","w_hw","w_ww")]
D05 <- D05%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D05 <- select(D05,c("MID","FID","year","earn","earnu","wage","hw","ww"))

#labor market outcomes 2 years ago are not reported in the individual file for 2007, so I use family file and generate MID and FID by merging with the identifiers file by family interview number
D07 <- identifiers_panel %>% 
  filter(year==2009) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt07, by = c("intnum","year")) %>%
  mutate(year = 2007)

D09 <- identifiers_panel %>% 
  filter(year==2011) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt09, by = c("intnum","year")) %>%
  mutate(year = 2009)

D11 <- identifiers_panel %>% 
  filter(year==2013) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt11, by = c("intnum","year")) %>%
  mutate(year = 2011)

D13 <- identifiers_panel %>% 
  filter(year==2015) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt13, by = c("intnum","year")) %>%
  mutate(year = 2013)

D15 <- identifiers_panel %>% 
  filter(year==2017) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt15, by = c("intnum","year")) %>%
  mutate(year = 2015)

D17 <- identifiers_panel %>% 
  filter(year==2019) %>% 
  mutate(MID = intnum68*1000 + pernum,FID=MID) %>%
  merge(Dt17, by = c("intnum","year")) %>%
  mutate(year = 2017)

Ih = D07$sn==1
D07[Ih,c("earn","hw","ww","earnu")] = D07[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D07$sn==2
D07[Iw,c("earn","hw","ww","earnu")] = D07[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D07 <- D07%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D07 <- select(D07,c("MID","FID","year","earn","earnu","wage","hw","ww"))


Ih = D09$sn==1
D09[Ih,c("earn","hw","ww","earnu")] = D09[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D09$sn==2
D09[Iw,c("earn","hw","ww","earnu")] = D09[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D09 <- D09%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D09 <- select(D09,c("MID","FID","year","earn","earnu","wage","hw","ww"))


Ih = D11$sn==1
D11[Ih,c("earn","hw","ww","earnu")] = D11[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D11$sn==2
D11[Iw,c("earn","hw","ww","earnu")] = D11[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D11 <- D11%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D11 <- select(D11,c("MID","FID","year","earn","earnu","wage","hw","ww"))


Ih = D13$sn==1
D13[Ih,c("earn","hw","ww","earnu")] = D13[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D13$sn==2
D13[Iw,c("earn","hw","ww","earnu")] = D13[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D13 <- D13%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D13 <- select(D13,c("MID","FID","year","earn","earnu","wage","hw","ww"))


Ih = D15$sn==1
D15[Ih,c("earn","hw","ww","earnu")] = D15[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D15$sn==2
D15[Iw,c("earn","hw","ww","earnu")] = D15[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D15 <- D15%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D15 <- select(D15,c("MID","FID","year","earn","earnu","wage","hw","ww"))

Ih = D17$sn==1
D17[Ih,c("earn","hw","ww","earnu")] = D17[Ih,c("h_earn","h_hw","h_ww","h_earnu")]
Iw = D17$sn==2
D17[Iw,c("earn","hw","ww","earnu")] = D17[Iw,c("w_earn","w_hw","w_ww","w_earnu")]
D17 <- D17%>% 
mutate(wage=NA_real_) #hourly wage is not calculated after 2001, will be imputed from earnings and hours
D17 <- select(D17,c("MID","FID","year","earn","earnu","wage","hw","ww"))

earnings_panel_supplement <- rbind(D97,D99,D01,D03,D05,D07,D09,D11,D13,D15,D17) %>%
  mutate(earn = na_if(na_if(na_if(na_if(earn,-9999999),-999999),99999999),9999999)) %>%
  mutate(earn = na_if(na_if(na_if(earn,-999998),9999998),99999998))%>%
  #mutate(earn = case_when(earnu==1 ~ earn*hw*ww,earnu==2 ~ earn*365,earnu==3 ~ earn*ww, earnu==4 ~ earn*ww/2,earnu==5 ~ earn*12, earnu==6 ~ earn, TRUE ~ NA_real_)) %>%
  mutate(ww = na_if(na_if(ww,99),98),hw=na_if(na_if(hw,999),998)) %>%
  mutate(hrs = ww*hw) %>%
  #mutate(wage2 = case_when(earn>0 & hrs>0 ~ earn/hrs,TRUE ~ NA_real_),0) %>%
  mutate(wage = case_when(wage<=0 ~ NA_real_,wage==999 ~ NA_real_,TRUE ~ wage)) %>%
  mutate(wage = case_when(earn>0 & earnu==6 & hrs>0 & is.na(wage) ~ earn/hrs, TRUE ~ wage))
```

## Main Interview: Total Income

```{r}

```{r}

Income_ly <- readxl::read_excel("../../../data/data_PSID/main/total_income/total_income.xlsx") %>%
rename(intnum97=ER10002, tax_income97=ER12069, total_income97=ER12079) %>%
  rename(intnum99=ER13002, tax_income99=ER16452, total_income99=ER16462) %>%
  rename(intnum01=ER17002, tax_income01=ER20449, total_income01=ER20456) %>%
  rename(intnum03=ER21002, tax_income03=ER24100, total_income03=ER24099) %>%
  rename(intnum05=ER25002, tax_income05=ER27953, total_income05=ER28037) %>%
  rename(intnum07=ER36002, tax_income07=ER40943, total_income07=ER41027)
  

Income_T2 <- readxl::read_excel("../../../data/data_PSID/main/total_income/total_income.xlsx") %>%
rename(intnum99=ER13002, total_income97=ER16219) %>%
rename(intnum01=ER17002, total_income99=ER20165) %>%
rename(intnum03=ER21002, total_income01=ER23764)


# Separate files by year
I97 <- Income_ly %>% select(intnum97,tax_income97,total_income97) %>%
  rename(intnum=intnum97,tax_income=tax_income97,total_income=total_income97) %>%
  mutate(year=1997)

I99 <- Income_ly %>% select(intnum99,tax_income99,total_income99) %>%
  rename(intnum=intnum99,tax_income=tax_income99,total_income=total_income99) %>%
  mutate(year=1999)

I99_T2 <- Income_T2 %>% select(intnum99,total_income97) %>%
  rename(intnum=intnum99,total_income=total_income97) %>%
  mutate(year=1999, tax_income=NA_real_)


I01 <- Income_ly %>% select(intnum01,tax_income01,total_income01) %>%
  rename(intnum=intnum01,tax_income=tax_income01,total_income=total_income01) %>%
  mutate(year=2001)

I01_T2 <- Income_T2 %>% select(intnum01,total_income99) %>%
  rename(intnum=intnum01,total_income=total_income99) %>%
  mutate(year=2001, tax_income=NA_real_)

I03 <- Income_ly %>% select(intnum03,tax_income03,total_income03) %>%
  rename(intnum=intnum03,tax_income=tax_income03,total_income=total_income03) %>%
  mutate(year=2003)

I03_T2 <- Income_T2 %>% select(intnum03,total_income01) %>%
  rename(intnum=intnum03,total_income=total_income01) %>%
  mutate(year=2003, tax_income=NA_real_)

I05 <- Income_ly %>% select(intnum05,tax_income05,total_income05) %>%
  rename(intnum=intnum05,tax_income=tax_income05,total_income=total_income05) %>%
  mutate(year=2005)

I07 <- Income_ly %>% select(intnum07,tax_income07,total_income07) %>%
  rename(intnum=intnum07,tax_income=tax_income07,total_income=total_income07) %>%
  mutate(year=2007)




i97 <- identifiers_panel %>% 
  filter(year==1997) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I97) %>%
  mutate(year = 1996)

i99 <- identifiers_panel %>% 
  filter(year==1999) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I99) %>%
  mutate(year = 1998)

i99_T2 <- identifiers_panel %>% 
  filter(year==1999) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I99_T2) %>%
  mutate(total_income=na_if(na_if(total_income,9999999),9999998))%>%
  mutate(year = 1997)

i01 <- identifiers_panel %>% 
  filter(year==2001) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I01) %>%
  mutate(year = 2000)

i01_T2 <- identifiers_panel %>% 
  filter(year==2001) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I01_T2) %>%
    mutate(total_income=na_if(na_if(total_income,999999999),999999998))%>%
  mutate(year = 1999)

i03 <- identifiers_panel %>% 
  filter(year==2003) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I03) %>%
  mutate(year = 2002)

i03_T2 <- identifiers_panel %>% 
  filter(year==2003) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I03_T2) %>%
    mutate(total_income=na_if(na_if(total_income,999999999),999999998))%>%
  mutate(year = 2001)

i05 <- identifiers_panel %>% 
  filter(year==2005) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I05) %>%
  mutate(year = 2004)

i07 <- identifiers_panel %>% 
  filter(year==2007) %>% 
  mutate(X = NULL,MID = intnum68*1000 + pernum) %>%
  select(MID, year, intnum)%>%
  merge(I07) %>%
  mutate(year = 2006)


total_income <- rbind(i97,i99,i99_T2,i01,i01_T2,i03,i03_T2,i05,i07) %>%
  select(-intnum)
```

# Arranging PSID-CDS Data

## Assessments

Arrange the child assessments into a panel.

```{r}

D <- readxl::read_excel("../../../data/data_PSID/cds/assessments/ChildAssessments.xlsx") %>% 
  mutate(KID = ER30001*1000 + ER30002)

# first we want to arrange the data nicely. We have the same scores for each year. They are:
col_names <- c("LW_std","LW_raw","PC_std","PC_raw","AP_std","AP_raw","DigSpan","BPE","BPN")

# 3 vectors with the corresponding variable names for each year
names97 <- c("Q3LW_SS","Q3LWRAW","Q3PC_SS","Q3PCRAW","Q3AP_SS","Q3APRAW","Q3DSTOT","BPI_E97","BPI_N97")

names02 <- c("Q24LWSS","Q24LWRAW","Q24PCSS","Q24PCRAW","Q24APSS","Q24APRAW","Q24DSTO","BPI_E02","BPI_N02")


names07 <- c("Q34LWSS","Q34LWRAW","Q34PCSS","Q34PCRAW","Q34APSS","Q34APRAW","Q34DSTO","BPI_E07","BPI_N07")

# separate and recombine files in a panel
D97 <- D[,c("KID",names97)]
names(D97)[2:10] <- col_names
D97$year <- 1997

D02 <- D[,c("KID",names02)]
names(D02)[2:10] <- col_names
D02$year <- 2002

D07 <- D[,c("KID",names07)]
names(D07)[2:10] <- col_names
D07$year <- 2007

D <- rbind(D97,D02,D07) %>%
  as.data.frame()

# Finally, replace missing values
missvals <- c(999,99,999,99,999,99,99,99,99)
for (i in 1:9) {
  D[,col_names[i]] <- na_if(D[,col_names[i]],missvals[i])
}

write.csv(D,"../../../data/data_PSID/cds/assessments/AssessmentPanel.csv")
assessment_panel <- D

```

Assemble a measure of the PCG's passage comprehension score.

```{r}
m2 <- readxl::read_excel("../../../data/data_PSID/cds/assessments/PCG-PC.xlsx") %>% 
  mutate(KID = ER30001*1000 + ER30002) %>% 
  select(KID,Q1PCSS,Q1PCRAW)%>%
  rename(m_pc_st= Q1PCSS, m_pc_raw= Q1PCRAW)

# load the auxiliary data and create the Kid ID and the PCG ID.
# then merge by Kid ID with the PCG's test score data
# then drop missing observations
cds <- readxl::read_excel("../../../data/data_PSID/CDS-aux-info.xlsx") %>%
  mutate(KID = CDSCUMID68*1000 + CDSCUMPN,MID = ID68PCG97*1000 + PNPCG97) %>%
  select(KID,MID) %>%
  merge(m2) %>%
  filter(m_pc_st>0) %>%
  select(MID,m_pc_st,m_pc_raw) %>%
  unique()

# there are a small handful of observations where the raw score is the same but the conversion to standardized score differs slightly, in this case we just keep the first observation
cds <- cds %>%
  group_by(MID) %>%
  filter(row_number()==1)

passage_comprehension <- cds

```

## Time Diaries

First load the 1997 and 2002 time diaries, clean them, and rename the files so they are consistent across those years. The coding manual for time use activities are [here](../../../data/data_PSID/docs/codingman.pdf) and [here.](../../../data/data_PSID/docs/2003codebook.pdf). Inspecting these you will see that the code categories are slightly more fine in the 2002 wave (one extra digit). So for example a category coded as 919 in 1997 becomes 9190 in 2002 or potentially 919$x$ for some $x\in\{1,...,9\}$ depending on the sub-category.

For a consistent sent of categories, we use the 1997 activity codes and round down to the nearest multiple of 10 for 2002 categories. Here is a summary of the variables:

Variable      Description
------------  -------------------------------------- 
COLA          Activity Code
DURATION      Duration of activity in seconds
COLG_B        Mother participating in activity
COLG_C        Father participating in activity
COLH_B        Mother around but not participating
COLH_C        Father around but not participating
COLG_I        Grandparent or great-grandparent participating in activity
COLG_J        Other relative participating in activity
COLG_K        Other non-relative participating in activity
COLH_I        Grandparent or great-grandparent around but not participating
COLH_J        Other relative around but not participating
COLH_K        Other non-relative around but not participating
COLF          Where was Child? (40 = babysitter,home-based care,neighbors, 89 = center based care)
WDAYWEND      Indicates weekday or weekend

The chunk below imports the data, appends both waves into one dataset (after renaming), and fills in some missing values.

```{r}
# 300-399 is obtaining goods/services (remove?)
# 919 tv
T0 <- read.csv("../../../data/data_PSID/cds/time-diary/TD97full.csv") %>%
  mutate(KID=ER30001*1000 + ER30002,year = 1997) %>%
  select(KID,year,COLA,COLG_B,COLG_C,COLG_I,COLG_J,COLG_K,COLH_B,COLH_C,COLH_I,COLH_J,COLH_K,COLF,WDAYWEND,DURATION, COLJ)



T1 <- read.csv("../../../data/data_PSID/cds/time-diary/TD02full.csv") %>%
  mutate(KID=ER30001*1000 + ER30002,year = 2002) %>%
  rename(COLG_B = COLGB_02, COLG_C = COLGC_02 , COLG_I = COLGI_02,COLG_J = COLGJ_02,COLG_K = COLGK_02, COLH_B = COLHB_02, COLH_C = COLHC_02, COLH_I = COLHI_02,COLH_J = COLHJ_02,COLH_K = COLHK_02, WDAYWEND = DIARY_02, DURATION = DUR_02, COLA = COLA_02,COLF = COLF_02, COLJ=COLJ_02) %>%
  select(KID,year,COLA,COLG_B,COLG_C,COLG_I,COLG_J,COLG_K,COLH_B,COLH_C,COLH_I,COLH_J,COLH_K,COLF,WDAYWEND,DURATION, COLJ) %>%
  mutate(COLA = floor(COLA/10)) #<- have to round down

T2 <- read.csv("../../../data/data_PSID/cds/time-diary/TD07full.csv")%>%
  mutate(KID=ER30001*1000 + ER30002,year = 2007) %>%
  rename(COLG_B = COLHB_07, COLG_C = COLHC_07 , COLH_B = COLIB_07, COLH_C = COLIC_07, WDAYWEND = DIARY_07, DURATION = DUR_07, COLA = COLA_07, 
         COLG_I= COLHI_07  , COLG_J= COLHJ_07, COLG_K= COLHK_07, COLH_I= COLII_07 ,COLH_J= COLIJ_07,COLH_K= COLIK_07,COLF = COLG_07, COLJ=COLJ_07) %>%
   select(KID,year,COLA,COLG_B,COLG_C,COLG_I,COLG_J,COLG_K,COLH_B,COLH_C,COLH_I,COLH_J,COLH_K,COLF,WDAYWEND,DURATION, COLJ) %>%
  mutate(COLG_B = if_else(COLG_B==5, as.integer(0), as.integer(COLG_B)),COLG_C = if_else(COLG_C==5, as.integer(0), as.integer(COLG_C)),
         COLH_B = if_else(COLH_B==5, as.integer(0), as.integer(COLH_B)), COLH_C = if_else(COLH_C==5, as.integer(0), as.integer(COLH_C)),#<- 5 corresponds to activity being personal, private, school, or work according to the codebook - I make it 0
         COLA = floor(COLA/10), COLJ = floor(COLJ/10)) #<- have to round down

TD <- rbind(T0,T1,T2) %>%
  mutate(time2ind = COLA!=919) %>% #<-
  mutate(COLG_B = na_if(COLG_B,9),COLG_C = na_if(COLG_C,9),COLH_B = na_if(COLH_B,9), COLH_C = na_if(COLH_C,9),COLG_I = na_if(COLG_I,9),COLG_J = na_if(COLG_J,9),COLH_K = na_if(COLH_K,9),COLH_I = na_if(COLH_I,9),COLH_J = na_if(COLH_J,9),COLH_K = na_if(COLH_K,9)) #<- code in missing categories



```

Next, we define a few categories of time use that we will add up for each individual. The comments below describe what each defined variable captures.

```{r}

#create a dummy variable equal to 1 if parents are engaged into the "investment" activity

TD$investment=ifelse(TD$COLA==258|TD$COLA==222|TD$COLA==249|TD$COLA==259|TD$COLA==238|
                       TD$COLA==339|TD$COLA==411|TD$COLA==488|TD$COLA==501|TD$COLA==504|
                       TD$COLA==511|TD$COLA==519|TD$COLA==549|TD$COLA==569|
                       (TD$COLA>=709&TD$COLA<=749)|(TD$COLA>=801&TD$COLA<=889)|
                       TD$COLA==939|TD$COLA==941|TD$COLA==959|TD$COLA==942|
                       TD$COLA==943|TD$COLA==962|TD$COLA==967|TD$COLA==979|TD$COLA==963,1,0)

#create an dummy variable equal to 1 if parents are engaged into the "socializing" activity
TD$social=ifelse(TD$COLA==439|TD$COLA==448|TD$COLA==449|(TD$COLA>=601&TD$COLA<=689)|
                   TD$COLA==752|TD$COLA==769|TD$COLA==771|TD$COLA==772|TD$COLA==789,1,0)

#create expanded investment and social dummy variables


TD$ex_soc=ifelse(TD$COLA==108|TD$COLA==109|TD$COLA==118|TD$COLA==119|TD$COLA==698|TD$COLA==699|TD$COLA==799|TD$social==1,1,0)


TD$ex_inv=ifelse(TD$investment==1|
                   TD$COLA==248|TD$COLA==221|TD$COLA==239|TD$COLA==377|
                   TD$COLA==484|TD$COLA==510|TD$COLA==512|TD$COLA==599|
                   TD$COLA==597|TD$COLA==598|TD$COLA==899,1,0)

# a dummy if no parent is around 
TD$no_parent = (TD$COLG_B==0) & (TD$COLG_C==0) & (TD$COLH_B==0) & (TD$COLH_C==0)


# create a formal, non-relative care variable
TD$formal = TD$no_parent & (TD$COLF==84)

# create a relative care variable
TD$relative = TD$no_parent & (TD$COLG_I==1 | TD$COLG_J==1 | TD$COLH_I==1 | TD$COLH_J==1)
# create a relative present variable
TD$relative_present = (TD$COLG_I==1 | TD$COLG_J==1 | TD$COLH_I==1 | TD$COLH_J==1)


# create an informal non-relative care variable. Defined as time in non-relative care that does not occur in a formal center and does not occur at school

TD$informal = TD$no_parent & !TD$relative & (TD$COLF!=84) & (TD$COLF!=80) & (TD$COLG_K==1 | TD$COLH_K==1)

```


We want a file that for each year and child, contains the total active time for mothers and fathers. Eventually, we also want to look at this disaggregated. We also want to look at total time invested.

When aggregating to a weekly estimate, we'll add a weight of 5 to weekdays, and 2 to weekends.

```{r}
TD$wght = 2 + 3*TD$WDAYWEND
```

Now we create the means of active time, measured in hours per week. Our main measures, *tau_m* and *tau_f*, sum up all activity times with parents actively participating. The alternatives *tau_m2* and *tau_f2* do the same thing but exclude TV watching from the list of categories. The other investment categories are defined in the code chunk above.

```{r}
TD_agg <- TD %>% 
  group_by(KID,year) %>% 
  summarise(tau_m = sum(DURATION*wght*COLG_B,na.rm = TRUE)/3600, tau_f = sum(DURATION*wght*COLG_C,na.rm=TRUE)/3600, tau_both = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C),na.rm=TRUE)/3600, tau_m2 = sum(time2ind*DURATION*wght*COLG_B,na.rm = TRUE)/3600, tau_f2 = sum(time2ind*DURATION*wght*COLG_C,na.rm=TRUE)/3600,
            tau_m_investment = sum(DURATION*wght*COLG_B*investment,na.rm = TRUE)/3600,  tau_m_social = sum(DURATION*wght*COLG_B*social,na.rm = TRUE)/3600, tau_m_ex_inv = sum(DURATION*wght*COLG_B*ex_inv,na.rm = TRUE)/3600,  tau_m_ex_soc = sum(DURATION*wght*COLG_B*ex_soc,na.rm = TRUE)/3600, 
            tau_f_investment = sum(DURATION*wght*COLG_C*investment,na.rm = TRUE)/3600,  tau_f_social = sum(DURATION*wght*COLG_C*social,na.rm = TRUE)/3600, tau_f_ex_inv = sum(DURATION*wght*COLG_C*ex_inv,na.rm = TRUE)/3600, tau_f_ex_soc = sum(DURATION*wght*COLG_C*ex_soc,na.rm = TRUE)/3600,
            tau_both_investment = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C)*investment,na.rm = TRUE)/3600,  tau_both_social = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C)*social,na.rm = TRUE)/3600, tau_both_ex_inv = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C)*ex_inv,na.rm = TRUE)/3600, tau_both_ex_soc = sum(DURATION*wght*(COLG_B + (1-COLG_B)*COLG_C)*ex_soc,na.rm = TRUE)/3600,
            formal = sum(DURATION*wght*formal,na.rm = TRUE)/3600,informal = sum(DURATION*wght*informal,na.rm = TRUE)/3600,relative = sum(DURATION*wght*relative,na.rm = TRUE)/3600,relative_present = sum(DURATION*wght*relative_present,na.rm = TRUE)/3600)


#Create expanded social and expanded investment measure
TD_agg <- TD_agg %>% 
  mutate(tau_m_socinvest=tau_m_ex_inv+tau_m_ex_soc,
         tau_f_socinvest=tau_f_ex_inv+tau_f_ex_soc,
         tau_both_socinvest=tau_both_ex_inv+tau_both_ex_soc)

```

Read the frequency of gatherings with family and friends.

```{r}
fr97 <- readxl::read_excel("../../../data/data_PSID/cds/time-diary/family_friends/family_friends.xlsx") %>% 
  mutate(KID=ER30001*1000 + ER30002,year = 1997, Q1B7=case_when(Q1B7>=8 ~ NA_real_, TRUE ~Q1B7),
         live_grand=case_when(BIOGPR97==1 ~ 1,BIOGPR97==2 ~ 0,BIOGPR97==3 ~ 0, TRUE ~ NA_real_)) %>%
  select(KID, year,Q1B7,live_grand)%>%
  rename(freq_family=Q1B7)

#use grandparents living/not living with child in 2001 for 2002 (2003 is also available)
fr02 <- readxl::read_excel("../../../data/data_PSID/cds/time-diary/family_friends/family_friends.xlsx") %>% 
  mutate(KID=ER30001*1000 + ER30002,year = 2002, Q21E6=case_when(Q21E6>=8 ~ NA_real_, TRUE ~Q21E6),
        live_grand=case_when(BIOGPR01==1 ~ 1,BIOGPR01==2 ~ 0,BIOGPR01==3 ~ 0,BIOGPR01==4 ~ 0, TRUE ~ NA_real_)) %>%
  select(KID, year,Q21E6,live_grand)%>%
  rename(freq_family=Q21E6)

fr07 <- readxl::read_excel("../../../data/data_PSID/cds/time-diary/family_friends/family_friends.xlsx") %>% 
  mutate(KID=ER30001*1000 + ER30002,year = 2007, Q31E6=case_when(Q31E6>=8 ~ NA_real_, TRUE ~Q31E6),
              live_grand=case_when(BIOGPR07==1 ~ 1,BIOGPR07==2 ~ 0,BIOGPR07==3 ~ 0,BIOGPR07==4 ~ 0, TRUE ~ NA_real_)) %>%
  select(KID, year,Q31E6,live_grand)%>%
  rename(freq_family=Q31E6)

#stack into a panel
fr <- rbind(fr97,fr02,fr07)

#merge with panel of time
TD_agg <- TD_agg %>%
  left_join(fr)

```

## Expenditures


```{r}

E <- read.csv("../../../data/data_PSID/cds/expenditures/Exp_0207.csv")
#generate kid's ID
E <- E %>%
  mutate(KID = ER30001*1000 + ER30002)

#First select kids in 2002 sample and variables describing 2002 hh expenditures

E02<-E[E$KID02==1.1,] %>%select(KID,starts_with("Q21"))
#Substitute to missing if the primary caregiver did not know about costs
E02[(E02$Q21H23A>=8|is.na(E02$Q21H23A)==1|E02$Q21H23A1>=99998),"Q21H23A1"]<-NA
E02[(E02$Q21H23B>=8|is.na(E02$Q21H23B)==1|E02$Q21H23B1>=99998),"Q21H23B1"]<-NA
E02[(E02$Q21H23C>=8|is.na(E02$Q21H23C)==1|E02$Q21H23C1>=9998),"Q21H23C1"]<-NA
E02[(E02$Q21H23D>=8|is.na(E02$Q21H23D)==1|E02$Q21H23D1>=9998),"Q21H23D1"]<-NA
E02[(E02$Q21H23H>=8|is.na(E02$Q21H23H)==1|E02$Q21H23H1>=99998),"Q21H23H1"]<-NA

E02 <- E02 %>%
  rename(Toys = Q21H23A1, Vacation = Q21H23B1, SchSupplies = Q21H23C1, Clothing = Q21H23D1, Food = Q21H23H1) %>%
  mutate(Toys = Toys/52, Vacation = Vacation/52, SchSupplies = SchSupplies/52, Clothing = Clothing/52, Food = Food/52)


#Substitute to missing if the primary caregiver did not know about costs
E02$Q21G5D[(E02$Q21G5D>=9998)] <- NA
E02$Q21H5C[(E02$Q21H5C>=9998)] <- NA
E02$Q21G10C[(E02$Q21G10C>=9998)] <- NA
E02$Q21H8B[(E02$Q21H8B>=9998)] <- NA
E02$Q21G8I[(E02$Q21G8I>=9998)] <- NA
E02$Q21H7F[(E02$Q21H7F>=9998)] <- NA
E02$Q21G7C [(E02$Q21G7C >=9998)] <- NA
E02$Q21H6B[(E02$Q21H6B>=9998)] <- NA
E02$Q21B12A1[(E02$Q21B12A1>=99998)] <- NA


#Combine expenditure on children in diffenet age groups, generate annual tuition cost
E02 <- E02 %>%
  mutate(year = 2002 ) %>%
  mutate(tutoring = Q21G5D + Q21H5C) %>%
  mutate(comm_grps = (Q21G10C + Q21H8B)) %>%
  mutate(sports = (Q21G8I + Q21H7F)) %>%
  mutate(lessons = (Q21G7C + Q21H6B)) %>%
  mutate(tuition = Q21B12A1)%>%
  mutate(tuition = case_when(Q21B12A2==3 ~ 50*as.numeric(tuition),Q21B12A2==5 ~ 12*as.numeric(tuition), TRUE ~ as.numeric(tuition)))%>%
  select(KID, year, tuition, lessons, tutoring, comm_grps, sports, Toys, Vacation, SchSupplies, Clothing, Food)
    

#Select kids in 2007 sample and variables describing 2007 hh expenditures

E07<-E[E$KID07==1.1,] %>%select(KID,starts_with("Q31"))
#Substitute to missing if the primary caregiver did not not about costs
E07[(E07$Q31H23A>=8|is.na(E07$Q31H23A)==1|E07$Q31H23A1>=99998),"Q31H23A1"]<-NA
E07[(E07$Q31H23B>=8|is.na(E07$Q31H23B)==1|E07$Q31H23B1>=99998),"Q31H23B1"]<-NA
E07[(E07$Q31H23C>=8|is.na(E07$Q31H23C)==1|E07$Q31H23C1>=99998),"Q31H23C1"]<-NA
E07[(E07$Q31H23D>=8|is.na(E07$Q31H23D)==1|E07$Q31H23D1>=99998),"Q31H23D1"]<-NA
E07[(E07$Q31H23H>=8|is.na(E07$Q31H23H)==1|E07$Q31H23H1>=99998),"Q31H23H1"]<-NA

E07 <- E07 %>%
  rename(Toys = Q31H23A1, Vacation = Q31H23B1, SchSupplies = Q31H23C1, Clothing = Q31H23D1, Food = Q31H23H1) %>%
  mutate(Toys = Toys/52, Vacation = Vacation/52, SchSupplies = SchSupplies/52, Clothing = Clothing/52, Food = NA)



E07$Q31H5C[(E07$Q31H5C>=9998)] <- NA
E07$Q31H8B[(E07$Q31H8B>=9998)] <- NA
E07$Q31H7F[(E07$Q31H7F>=9998)] <- NA
E07$Q31H6B[(E07$Q31H6B>=9998)] <- NA
E07$Q31B12A1[(E07$Q31B12A1>=99998)] <- NA

#In 2007 children the information is collected for children 10+, generate annual tuition cost
E07 <- E07 %>%
  mutate(year = 2007) %>%
  mutate(tutoring = Q31H5C) %>%
  mutate(comm_grps = Q31H8B) %>%
  mutate(sports = Q31H7F) %>%
  mutate(lessons = Q31H6B) %>%
  mutate(tuition = Q31B12A1)%>%
  mutate(tuition = case_when(Q31B12A2==3 ~ 50*as.numeric(tuition),Q31B12A2==5 ~ 12*as.numeric(tuition), TRUE ~ as.numeric(tuition)))%>%
  select(KID,year,Toys,Vacation,SchSupplies,Clothing,Food,sports,lessons,comm_grps,tutoring,tuition)

#Merge 2002 and 2007 expenditure data, transform annual expenditures to weekly

E <- rbind(E07, E02)
E=E%>%mutate(tuition=tuition/52, 
         comm_grps=comm_grps/52,
         lessons=lessons/52,
         tutoring=tutoring/52,
         sports=sports/52)

#Drop children with missing ID
E <- E[is.na(E$KID)==0,]

cols <- c("KID","year","Toys","Vacation","SchSupplies","Clothing","Food","sports","lessons","comm_grps","tutoring","tuition")

cds_expenditures <- E[,cols]


```



### Per Pupil Expenditures


When possible, as part of the CDS supplement, the PSID conducted interviews with the Middle and Elementary School administrators. The data below cleans all available data on reported per pupil expenditure for the school.

```{r}
PPE <- readxl::read_excel("../../../data/data_PSID/cds/expenditures/CDS_PPE.xlsx") %>%
  mutate(KID = ER30001*1000 + ER30002) %>%
  rename(PPE = Q12A27) %>%
  mutate(PPE = case_when(PPE<99998 ~ PPE)) %>% #<- code in missing values
  select(KID,PPE) %>%
  mutate(year = 1997) #<- add the year for merging into a panel.
```

## Childcare Expenditures

In 2002/2007, the interview distinguishes between weekday and weekend care options (which the 1997 wave does not). Since there were very few responses to the weekend care question, we are just using weekday care. In addition, since there were very few responses given beyond the 1st care option, we are only using those data.

For convenience the 1997 variables are that we use are tabulated below.

Vname      Variable Description
---------- -----------------------------------------
ER30001    1968 INTERVIEW NUMBER                   
ER30002    PERSON NUMBER                         68
ER33401    1997 INTERVIEW NUMBER                   
Q1H14      MOST USED ARRNGMNT 97                   
Q1H15      2ND ARRNGMNT MOST USED 97               
Q1H18      USUAL ARRNGMNT-DAYS PER WEEK 97         
Q1H19      USUAL ARRNGMNT-HRS PER WEEK 97          
Q1H21      USUAL ARRNGMNT-COST 97                  
Q1H21A     USUAL ARRNGMNT-PAYMENT RATE 97          
Q1H22      USUAL ARRNGMNT-OTHER CHILDREN 97        
Q1H22A     USUAL ARRNGMNT-# OF OTHER CHILDREN 97   
Q1H24      2ND ARRNGMNT-DAYS PER WEEK 97           
Q1H25      2ND ARRNGMNT-HRS PER WEEK 97            
Q1H27      2ND ARRNGMNT-COST 97                    
Q1H27A     2ND ARRNGMNT-PAYMENT RATE 97            
Q1H28      2ND ARRNGMNT-OTHER CHILDREN 97          
Q1H28A     2ND ARRNGMNT-# OF OTHER CHILDREN 97     
   

For now, let's just create data on expenditures for each year. In 1997, the cost was reported in one of several units: (1) hourly (2) daily (3) weekly (4) fortnightly (5) monthly (6) annual. We standardize this into a weekly measure. We also derive an hourly price using the weekly cost and hours per week, as well as an indicator for the type of arrangement being used.

```{r}
Ch97 <- readxl::read_excel("../../../data/data_PSID/cds/childcare/chcare_9707.xlsx") %>% #<- read in the data
  mutate(KID = ER30001*1000 + ER30002) %>%
  mutate(Q1H21 = case_when(Q1H21<9998 ~ Q1H21),Q1H18 = case_when(Q1H18<8 ~ Q1H18),Q1H19=case_when(Q1H19<998 ~ Q1H19),Q1H21A=case_when(Q1H21A<8 ~ Q1H21A)) %>% #<- fill in missing values 
  mutate(Q1H27 = case_when(Q1H27<9998 ~Q1H27 ),Q1H24= case_when(Q1H24<8 ~ Q1H24),Q1H25=case_when(Q1H25<98 ~ Q1H25),Q1H27A=case_when(Q1H27A<8 ~ Q1H27A)) %>% #<- fill in missing values for second arrangement
  mutate(chcareExp97 = case_when(Q1H21A==1 ~ Q1H21*Q1H19, #, #<- cost reported hourly
                                 Q1H21A==2 ~ Q1H21*Q1H18,  #<- reported daily
                                 Q1H21A==4 ~ Q1H21/2, #<- reported fortnightly
                                 Q1H21A==5 ~ Q1H21/(52/12), #<- reported monthly
                                 Q1H21A==6 ~ Q1H21/52, #<-reported annually
                                 TRUE ~ Q1H21)) %>% #<- reported weekly
    mutate(chcareExpSecond97 = case_when(Q1H27A==1 ~ Q1H27*Q1H25, #, #<- cost reported hourly
                                 Q1H27A==2 ~ Q1H27*Q1H24,  #<- reported daily
                                 Q1H27A==4 ~ Q1H27/2, #<- reported fortnightly
                                 Q1H27A==5 ~ Q1H27/(52/12), #<- reported monthly
                                 Q1H27A==6 ~ Q1H27/52, #<-reported annually
                                 TRUE ~ Q1H27)) %>% #<- reported weekly
  mutate(Price97 = chcareExp97/Q1H19) %>% #<- derive price
  mutate(PriceSecond97 = chcareExpSecond97/Q1H25) %>% #<- derive price
  mutate(Arrange97 = case_when(Q1H14==1 | Q1H14==2 | Q1H14==4 ~ "Relative",
                               Q1H14==3 ~ "Non Relative",
                               Q1H14==5 ~ "Family Center",
                               Q1H14==7 ~ "Care Center",
                               Q1H14==8 ~ "Before/After School Prog."))%>% 
  mutate(ArrangeSecond97 = case_when(Q1H15==1 | Q1H15==2 | Q1H15==4 ~ "Relative",
                               Q1H15==3 ~ "Non Relative",
                               Q1H15==5 ~ "Family Center",
                               Q1H15==7 ~ "Care Center",
                               Q1H15==8 ~ "Before/After School Prog."))

```

The chunk of code below does the same thing for the 2002 PCG interview. For reference, here are the variables.

Vname      Variable Description
---------- ----------------------------------------
ER33601    2001 INTERVIEW NUMBER                   
ER33602    SEQUENCE NUMBER                       01
ER33603    RELATION TO HEAD                      01
CHREL      PCG CHILD FILE RELEASE NUMBER 02        
Q21C12     WKDAY: CARE USED MOST 02                
Q21C13     WKDAY: HRS PER WK 02                    
Q21C15A    WKDAY: COST OF CARE - AMT 02            
Q21C15B    WKDAY: COST OF CARE - UNIT 02           
ER33901    2007 INTERVIEW NUMBER                   
ER33902    SEQUENCE NUMBER                       07
ER33903    RELATION TO HEAD                      07
PCHREL07   PCG CHILD FILE RELEASE NUMBER 07        
Q31C12     WKDAY: CARE USED MOST 07                
Q31C13     WKDAY: HRS PER WK 07                    
Q31C15A    WKDAY: COST OF CARE - AMT 07            
Q31C15B    WKDAY: COST OF CARE - UNIT 07           

Unlike in 1997, there is no variable that reports the number of days per week. Therefore, to impute the weekly cost when a daily rate is reported, we use the average number of hours per day reported among those that use an hourly rate, and combine this with the hours per week report to get the number of days.

From 1997, we calculate the average number of hours per day as:
```{r}
hours_per_day <- Ch97 %>% filter(Q1H18>0) %>% summarise(mean(Q1H19/Q1H18,na.rm = TRUE))
hours_per_day[1,1]
```
So we get that childcare is used on average for about 3.1 hours per day. The imputed number of days per week is therefore

\[Days/Week = (Hours/Week)/(Hours/Day) = (Hours/Week)/3.1\]


```{r}

Ch02 <- readxl::read_excel("../../../data/data_PSID/cds/childcare/chcare_9707.xlsx") %>%
    mutate(KID = ER30001*1000 + ER30002) %>%
  mutate(Q21C15A = case_when(Q21C15A<9998 ~ Q21C15A),Q21C13=case_when(Q21C13<998 ~ Q21C13),Q21C15B=case_when(Q21C15B<8 ~ Q21C15B)) %>% #<- fill in missing values 
  mutate(Q21C21A = case_when(Q21C21A<998 ~ Q21C21A),Q21C19=case_when(Q21C19<998 ~ Q21C19),Q21C21B=case_when(Q21C21B<8 ~ Q21C21B)) %>% #<- fill in missing values for second childcare measure
  mutate(chcareExp02 = case_when(Q21C15B==1 ~ Q21C15A*Q21C13, #<- cost reported hourly
                                 Q21C15B==2 ~ Q21C15A*Q21C13/3.1, #<- reported daily?
                                 Q21C15B==4 ~ Q21C15A/2, #<- reported fortnightly
                                 Q21C15B==5 ~ Q21C15A/(52/12), #<- reported monthly
                                 Q21C15B==6 ~ Q21C15A/52,#<-reported annually
                                 TRUE ~ Q21C15A)) %>% #<- reported weekly
    mutate(chcareExpSecond02 = case_when(Q21C21B==1 ~ Q21C21A*Q21C19, #<- cost reported hourly
                                 Q21C21B==2 ~ Q21C21A*Q21C19/3.1, #<- reported daily?
                                 Q21C21B==4 ~ Q21C21A/2, #<- reported fortnightly
                                 Q21C21B==5 ~ Q21C21A/(52/12), #<- reported monthly
                                 Q21C21B==6 ~ Q21C21A/52,#<-reported annually
                                 TRUE ~ Q21C21A)) %>% #<- reported weekly
  mutate(Price02 = chcareExp02/Q21C13) %>% #<- derive hourly price
  mutate(PriceSecond02 = chcareExpSecond02/Q21C19) %>% #<- derive hourly price
  mutate(Arrange02 = case_when(Q21C12==1 | Q21C12==2  ~ "Relative",
                               Q21C12==3 ~ "Non Relative",
                               Q21C12==5 ~ "Family Center",
                               Q21C12==7 ~ "Care Center",
                               Q21C12==8 ~ "Before/After School Prog."))%>%
  mutate(ArrangeSecond02 = case_when(Q21C18==1 | Q21C18==2  ~ "Relative",
                               Q21C18==3 ~ "Non Relative",
                               Q21C18==5 ~ "Family Center",
                               Q21C18==7 ~ "Care Center",
                               Q21C18==8 ~ "Before/After School Prog."))



Ch07 <- readxl::read_excel("../../../data/data_PSID/cds/childcare/chcare_9707.xlsx") %>%
    mutate(KID = ER30001*1000 + ER30002) %>%
  mutate(Q31C15A = case_when(Q31C15A<9998 ~ Q31C15A),Q31C13=case_when(Q31C13<998 ~ Q31C13),Q31C15B=case_when(Q31C15B<8 ~ Q31C15B)) %>% #<- fill in missing values 
  mutate(Q31C21A = case_when(Q31C21A<998 ~ Q31C21A),Q31C19=case_when(Q31C19<998 ~ Q31C19),Q31C21B=case_when(Q31C21B<8 ~ Q31C21B)) %>% #<- fill in missing values for second childcare measure
  mutate(chcareExp07 = case_when(Q31C15B==1 ~ Q31C15A*Q31C13, #<- cost reported hourly
                                 Q31C15B==2 ~ Q31C15A*Q31C13/3.1, #<- reported daily?
                                 Q31C15B==4 ~ Q31C15A/2, #<- reported fortnightly
                                 Q31C15B==5 ~ Q31C15A/(52/12), #<- reported monthly
                                 Q31C15B==6 ~ Q31C15A/52,#<-reported annually
                                 TRUE ~ Q31C15A)) %>% #<- reported weekly
    mutate(chcareExpSecond07 = case_when(Q31C21B==1 ~ Q31C21A*Q31C19, #<- cost reported hourly
                                 Q31C21B==2 ~ Q31C21A*Q31C19/3.1, #<- reported daily?
                                 Q31C21B==4 ~ Q31C21A/2, #<- reported fortnightly
                                 Q31C21B==5 ~ Q31C21A/(52/12), #<- reported monthly
                                 Q31C21B==6 ~ Q31C21A/52,#<-reported annually
                                 TRUE ~ Q31C21A)) %>% #<- reported weekly
  mutate(Price07 = chcareExp07/Q31C13) %>% #<- derive hourly price
  mutate(PriceSecond07 = chcareExpSecond07/Q31C19) %>% #<- derive hourly price
  mutate(Arrange07 = case_when(Q31C12==1 | Q31C12==2  ~ "Relative",
                               Q31C12==3 ~ "Non Relative",
                               Q31C12==5 ~ "Family Center",
                               Q31C12==7 ~ "Care Center",
                               Q31C12==8 ~ "Before/After School Prog."))%>%
  mutate(ArrangeSecond07 = case_when(Q31C18==1 | Q31C18==2  ~ "Relative",
                               Q31C18==3 ~ "Non Relative",
                               Q31C18==5 ~ "Family Center",
                               Q31C18==7 ~ "Care Center",
                               Q31C18==8 ~ "Before/After School Prog."))

```


One issue is that in 1997, there is a question about whether the arrangement covers other children, and the number. It seems that often it does. It doesn't appear that the same question is asked in 2002/2007. Currently we are not attempting to normalize by the number of children covered in the arrangement.

### Collecting Data for Younger Children
In addition, for children younger than 5, the data were collected somewhat differently in 1997. These were done as part of retrospective history of child care use. To use these data we first have to clean the history to find current arrangements. Then we put all the data together. The variables are:

Vname      Variable Description
---------- ----------------------------------------
Q1H1Y      AGE 1ST CARED BY SOMEONE ELSE (YRS) 97  
Q1H1M      AGE 1ST CARED BY SOMEONE ELSE (MTHS) 97 
Q1H1A      BEFORE/AFTER K'GARTEN 97                
Q1H2_1     ARRNGMNT #1-REASON 97                   
Q1H3_1Y    ARRNGMNT #1-AGE START (YRS) 97          
Q1H3_1M    ARRNGMNT #1-AGE START (MTHS) 97         
Q1H4_1     ARRNGMNT #1-TYPE 97                     
Q1H5_1     ARRNGMNT #1-DAYS PER WEEK 97            
Q1H6_1     ARRNGMNT #1-HRS PER WEEK 97             
Q1H7_1     ARRNGMNT #1-COST 97                     
Q1H7A_1    ARRNGMNT #1- PAYMENT RATE 97            
Q1H7B_1    ARRNGMNT #1-OTHER CHILDREN 97           
Q1H7C_1    ARRNGMNT #1-# OF OTHER CHILDREN 97      
Q1H8_1Y    ARRNGMNT #1-AGE STOP (YRS) 97           
Q1H8_1M    ARRNGMNT #1-AGE STOP (MTHS) 97          
Q1H9_1     ARRNGMNT #1-REASON STOP 97              
Q1H10_1    ARRNGMNT #1-OTHER ARRNGMNTS 97     


```{r}
     
# this function calculates the weekly rate given the data
CalcRate <- function(varname,rate,cost,days,hours,Ch) {
  I_miss <- Ch[,cost]>=99998
  Ch[I_miss,cost] <- NA
  Ch[,varname] <- Ch[,cost]
  # Case 0: Weekly
  Ic = (Ch[,rate]==3)
  Ch[Ic,varname] = Ch[Ic,cost]
  # Case 1: Hourly
  Ic = (Ch[,rate]==1)
  Ch[Ic,varname] = Ch[Ic,cost]*Ch[Ic,hours]
  # Case 2: Daily
  Ic = (Ch[,rate]==2)
  Ch[Ic,varname] = Ch[Ic,cost]*Ch[Ic,days]
  # Case 4: Fortnightly
  Ic = (Ch[,rate]==4)
  Ch[Ic,varname] = Ch[Ic,cost]/2
  # Case 5: Monthly
  Ic = (Ch[,rate]==5)
  Ch[Ic,varname] = Ch[Ic,cost]/4
  # Case 6: Annual
  Ic = (Ch[,rate]==6)
  Ch[Ic,varname] = Ch[Ic,cost]/52
  Ch
}


ch_hist <- readxl::read_excel("../../../data/data_PSID/cds/childcare/Childcare_history.xlsx") %>%
  mutate(KID = ER30001*1000 + ER30002)


for (i in seq(1,8)) {
  vname = paste("exp97_",i,sep="")
  rate = paste("Q1H7A_",i,sep="")
  cost = paste("Q1H7_",i,sep="")
  days = paste("Q1H5_",i,sep="")
  hours = paste("Q1H6_",i,sep="")
  # replace missing hours with NA:
  ch_hist[ch_hist[,hours]==998|ch_hist[,hours]==999,hours]= NA_real_
 #ch_hist[,hours] = na_if(ch_hist[,hours],998)
 #ch_hist[,hours] = na_if(ch_hist[,hours],999)
  ch_hist <- CalcRate(vname,rate,cost,days,hours,ch_hist)
}


ch_hist$Arrange <- -1
ch_hist$Freq <- 0
ch_hist$Exp97 <- -1
ch_hist$Hours97 <- -1
ch_hist$Exp97Tot <- 0
ch_hist$Hours97Tot <- 0

for (i in seq(1,nrow(ch_hist))) {
  if (ch_hist[i,"Q1H1Y"]==0) {
    ch_hist[i,"Arrange"] = 0
  }
  else {
    for (j in seq(1,8)) {
      if (ch_hist[i,paste("Q1H8_",j,"Y",sep="")]==96) {
        freq_var = paste("Q1H5_",j,sep="")
        ch_hist[i,"Exp97Tot"] = ch_hist[i,"Exp97Tot"] + ch_hist[i,paste("exp97_",j,sep="")]
        ch_hist[i,"Hours97Tot"] = ch_hist[i,"Hours97Tot"] + ch_hist[i,paste("Q1H6_",j,sep="")]
        days_per_week = ch_hist[i,freq_var]
        if (days_per_week>ch_hist[i,"Freq"] & days_per_week<8) {
          ch_hist[i,"Arrange"] = ch_hist[i,paste("Q1H4_",j,sep="")]
          ch_hist[i,"Freq"] = ch_hist[i,freq_var]
          ch_hist[i,"Exp97"] = ch_hist[i,paste("exp97_",j,sep="")]
          ch_hist[i,"Hours97"] = ch_hist[i,paste("Q1H6_",j,sep="")]
        }
      }
    }
  }
}

# now code up arrangements:
# - Childcare arrangements
ch_hist$Arrange97 <- NA
I1 <- ch_hist$Arrange==0
ch_hist$Arrange97[I1] <- "Never Cared for By Other"
I1 <- is.na(ch_hist$Arrange)
ch_hist$Arrange97[I1] <- "No Detectable Arrangement"
I1 <- ch_hist$Arrange==1 | ch_hist$Arrange==3
ch_hist$Arrange97[I1] <- "Relative"
I1 <- ch_hist$Arrange==2 | ch_hist$Arrange==4
ch_hist$Arrange97[I1] <- "Non Relative"
I1 <- ch_hist$Arrange==5
ch_hist$Arrange97[I1] <- "Head Start"
I1 <- ch_hist$Arrange==6
ch_hist$Arrange97[I1] <- "Care Center"
I1 <- ch_hist$Arrange==7
ch_hist$Arrange97[I1] <- "Before/After School Prog."
I1 <- ch_hist$Arrange==8
ch_hist$Arrange97[I1] <- "Self Care"
I1 <- ch_hist$Arrange>=97
ch_hist$Arrange97[I1] <- "Unkown Type"

# code a more basic breakdown:
ch_hist$CaretypeBroad <- "External"
ch_hist$CaretypeBroad[ch_hist$Arrange==0] <- "Never Care for by Other"
ch_hist$CaretypeBroad[is.na(ch_hist$Arrange)] <- "No Current Detectable"
  
```

Next we merge these data with the rest to create a panel.

```{r}
C97 <- Ch97[,c("KID","chcareExp97","Arrange97","Price97","chcareExpSecond97","ArrangeSecond97","PriceSecond97")]
C97$year <- 1997
C97 <- rename(C97,chcare = chcareExp97,Arrange = Arrange97, Price = Price97,
              chcare_second=chcareExpSecond97, ArrangeSecond = ArrangeSecond97, PriceSecond = PriceSecond97)
# now load preK data and merge, replacing the child care value with this value when available
C97 <- ch_hist %>%
  select(KID,Exp97Tot) %>%
  mutate(Exp97Tot = na_if(Exp97Tot,9998)) %>%
  merge(C97) %>%
  mutate(chcare = case_when(Exp97Tot>0 ~ Exp97Tot,TRUE ~ chcare)) %>%
  select(-Exp97Tot)


C02 <- Ch02[,c("KID","chcareExp02","Arrange02","Price02","chcareExpSecond02","ArrangeSecond02","PriceSecond02")]
C02$year <- 2002
C02 <- rename(C02,chcare = chcareExp02,Arrange = Arrange02, Price = Price02,
              chcare_second=chcareExpSecond02, ArrangeSecond = ArrangeSecond02, PriceSecond = PriceSecond02)

C07 <- Ch07[,c("KID","chcareExp07","Arrange07","Price07","chcareExpSecond07","ArrangeSecond07","PriceSecond07")]
C07$year <- 2007
C07 <- rename(C07,chcare = chcareExp07,Arrange = Arrange07, Price = Price07,
              chcare_second=chcareExpSecond07, ArrangeSecond = ArrangeSecond07, PriceSecond = PriceSecond07)


cds_childcare <- rbind(C97,C02,C07) 

```

## Age by Month

Construct the age in months for CDS years

```{r}

age_month <- readxl::read_excel("../../../data/data_PSID/cds/staff-child-ratios/age_month.xlsx") %>%
      mutate(KID = ER30001*1000 + ER30002) %>%
      rename(age_month_1997=AGEATPCG)%>%
      #replace full birthdate unknown values to missing
      mutate(age_month_1997=na_if(age_month_1997,999.9))%>%
      mutate(age_month_1997=floor(age_month_1997))%>%
      mutate(month_1997 =PCGIWYR  * 12 + PCGIWMON )%>%
      mutate(month_2002 = Q21IWYR * 12 + Q21IWMTH )%>%
      mutate(month_2007 =  Q31IWYR * 12 + Q31IWMTH)

#Compute age in months at 1997 interview
age_month_1997 <- age_month %>%
      select(KID, age_month_1997)%>%
      rename(age_month=age_month_1997)%>%
      mutate(year=1997)

#Compute age in months at 2002 interview using age in months in 1997 
#and time difference between 2002 and 1997 interviews

age_month_2002 <-  age_month %>%
      select(KID, age_month_1997, month_2002, month_1997)%>%
      mutate(age_month=age_month_1997+month_2002-month_1997, year=2002)%>%
      select(-month_2002, -month_1997, -age_month_1997)

#Compute age in months at 2007 interview using age in months in 1997 
#and time difference between 2007 and 1997 interviews

age_month_2007 <-  age_month %>%
      select(KID, age_month_1997, month_2007, month_1997)%>%
      mutate(age_month=age_month_1997+month_2007-month_1997, year=2007)%>%
      select(-month_2007, -month_1997, -age_month_1997)

#Make a panel for age in months 1997, 2002, and 2007
age_month <- rbind(age_month_1997,age_month_2002, age_month_2007)


```


## Staff-child ratios

Map the age in months of each child to a staff to child ratio

```{r}
staff_ratio <- readxl::read_excel("../../../data/data_PSID/cds/staff-child-ratios/Ratios_Center_Based_2002.xlsx")


####################
#Age 0-8 months
####################
staff_ratio_0_8 <- staff_ratio %>%
  select(state_str,month_0_8)%>%
  mutate(age_month=0)%>%
  rename(child_staff_ratio=month_0_8)

# Create a panel dataset
for (m in 1:8) {
  temp <- staff_ratio %>%
    select(state_str,month_0_8)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_0_8)
  
  staff_ratio_0_8= rbind(staff_ratio_0_8,temp)
}

#Second version with cutoffs 6 months later
staff_ratio_0_14 <- staff_ratio %>%
  select(state_str,month_0_8)%>%
  mutate(age_month=0)%>%
  rename(child_staff_ratio=month_0_8)

# Create a panel dataset
for (m in 1:14) {
  temp <- staff_ratio %>%
    select(state_str,month_0_8)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_0_8)
  staff_ratio_0_14= rbind(staff_ratio_0_14,temp)
}




####################
#Age 9-17 months
####################
staff_ratio_9_17 <- staff_ratio %>%
  select(state_str,month_9_17)%>%
  mutate(age_month=9)%>%
  rename(child_staff_ratio=month_9_17)

# Create a panel dataset
for (m in 10:17) {
  temp <- staff_ratio %>%
    select(state_str,month_9_17)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_9_17)
  staff_ratio_9_17=rbind(staff_ratio_9_17,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_15_23 <- staff_ratio %>%
  select(state_str,month_9_17)%>%
  mutate(age_month=15)%>%
  rename(child_staff_ratio=month_9_17)

# Create a panel dataset
for (m in 16:23) {
  temp <- staff_ratio %>%
    select(state_str,month_9_17)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_9_17)
  staff_ratio_15_23=rbind(staff_ratio_15_23,temp)
}
####################
#Age 18-26
####################

staff_ratio_18_26 <- staff_ratio %>%
  select(state_str,month_18_26)%>%
  mutate(age_month=18)%>%
  rename(child_staff_ratio=month_18_26)

# Create a panel dataset
for (m in 19:26) {
  temp <- staff_ratio %>%
    select(state_str,month_18_26)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_18_26)
  staff_ratio_18_26=rbind(staff_ratio_18_26,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_24_32 <- staff_ratio %>%
  select(state_str,month_18_26)%>%
  mutate(age_month=24)%>%
  rename(child_staff_ratio=month_18_26)

# Create a panel dataset
for (m in 25:32) {
  temp <- staff_ratio %>%
    select(state_str,month_18_26)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_18_26)
  staff_ratio_24_32=rbind(staff_ratio_24_32,temp)
}

####################
#Age 27-35
####################


staff_ratio_27_35 <- staff_ratio %>%
  select(state_str,month_27_35)%>%
  mutate(age_month=27)%>%
  rename(child_staff_ratio=month_27_35)

# Create a panel dataset
for (m in 28:35) {
  temp <- staff_ratio %>%
    select(state_str,month_27_35)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_27_35)
  staff_ratio_27_35=rbind(staff_ratio_27_35,temp)
}


#Second version with cutoffs 6 months later

staff_ratio_33_41 <- staff_ratio %>%
  select(state_str,month_27_35)%>%
  mutate(age_month=33)%>%
  rename(child_staff_ratio=month_27_35)

# Create a panel dataset
for (m in 34:41) {
  temp <- staff_ratio %>%
    select(state_str,month_27_35)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_27_35)
  
  staff_ratio_33_41=rbind(staff_ratio_33_41,temp)
}

####################
#Age 36-47 (3 years)
####################


staff_ratio_36_47 <- staff_ratio %>%
  select(state_str,month_36_47)%>%
  mutate(age_month=36)%>%
  rename(child_staff_ratio=month_36_47)

# Create a panel dataset
for (m in 37:47) {
  temp <- staff_ratio %>%
    select(state_str,month_36_47)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_36_47)
  
  staff_ratio_36_47=rbind(staff_ratio_36_47,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_42_53 <- staff_ratio %>%
  select(state_str,month_36_47)%>%
  mutate(age_month=42)%>%
  rename(child_staff_ratio=month_36_47)

# Create a panel dataset
for (m in 43:53) {
  temp <- staff_ratio %>%
    select(state_str,month_36_47)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_36_47)
  
  staff_ratio_42_53=rbind(staff_ratio_42_53,temp)
}

####################
#Age 48-59 (4 years)
####################

staff_ratio_48_59 <- staff_ratio %>%
  select(state_str,month_48_59)%>%
  mutate(age_month=48)%>%
  rename(child_staff_ratio=month_48_59)

# Create a panel dataset
for (m in 49:59) {
  temp <- staff_ratio %>%
    select(state_str,month_48_59)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_48_59)
  
  staff_ratio_48_59=rbind(staff_ratio_48_59,temp)
}


#Second version with cutoffs 6 months later


staff_ratio_54_65 <- staff_ratio %>%
  select(state_str,month_48_59)%>%
  mutate(age_month=54)%>%
  rename(child_staff_ratio=month_48_59)

# Create a panel dataset
for (m in 55:65) {
  temp <- staff_ratio %>%
    select(state_str,month_48_59)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_48_59)
  
  staff_ratio_54_65=rbind(staff_ratio_54_65,temp)
}

####################
#Age 60-71 (5 years)
####################

staff_ratio_60_71 <- staff_ratio %>%
  select(state_str,month_60_71)%>%
  mutate(age_month=60)%>%
  rename(child_staff_ratio=month_60_71)

# Create a panel dataset
for (m in 61:71) {
  temp <- staff_ratio %>%
    select(state_str,month_60_71)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_60_71)
  
  staff_ratio_60_71=rbind(staff_ratio_60_71,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_66_77 <- staff_ratio %>%
  select(state_str,month_60_71)%>%
  mutate(age_month=66)%>%
  rename(child_staff_ratio=month_60_71)

# Create a panel dataset
for (m in 67:77) {
  temp <- staff_ratio %>%
    select(state_str,month_60_71)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_60_71)
  
  staff_ratio_66_77=rbind(staff_ratio_66_77,temp)
}

####################
#Age 72-83 (6 years)
####################


staff_ratio_72_83 <- staff_ratio %>%
  select(state_str,month_72_83)%>%
  mutate(age_month=72)%>%
  rename(child_staff_ratio=month_72_83)

# Create a panel dataset
for (m in 73:83) {
  temp <- staff_ratio %>%
    select(state_str,month_72_83)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_72_83)
  
  staff_ratio_72_83=rbind(staff_ratio_72_83,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_78_89 <- staff_ratio %>%
  select(state_str,month_72_83)%>%
  mutate(age_month=78)%>%
  rename(child_staff_ratio=month_72_83)

# Create a panel dataset
for (m in 79:89) {
  temp <- staff_ratio %>%
    select(state_str,month_72_83)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_72_83)
  
  staff_ratio_78_89=rbind(staff_ratio_78_89,temp)
}

####################
#Age 84-95 (7 years)
####################

staff_ratio_84_95 <- staff_ratio %>%
  select(state_str,month_84_95)%>%
  mutate(age_month=84)%>%
  rename(child_staff_ratio=month_84_95)

# Create a panel dataset
for (m in 85:95) {
  temp <- staff_ratio %>%
    select(state_str,month_84_95)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_84_95)
  
  staff_ratio_84_95=rbind(staff_ratio_84_95,temp)
}

#Second version with cutoffs 6 months later
staff_ratio_90_101 <- staff_ratio %>%
  select(state_str,month_84_95)%>%
  mutate(age_month=90)%>%
  rename(child_staff_ratio=month_84_95)

# Create a panel dataset
for (m in 91:101) {
  temp <- staff_ratio %>%
    select(state_str,month_84_95)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_84_95)
  staff_ratio_90_101=rbind(staff_ratio_90_101,temp)
}

####################
#Age 96-119 (8-9 years old)
####################


staff_ratio_96_119 <- staff_ratio %>%
  select(state_str,month_96_119)%>%
  mutate(age_month=96)%>%
  rename(child_staff_ratio=month_96_119)

# Create a panel dataset
for (m in 97:119) {
  temp <- staff_ratio %>%
    select(state_str,month_96_119)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_96_119)
  staff_ratio_96_119=rbind(staff_ratio_96_119,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_102_125 <- staff_ratio %>%
  select(state_str,month_96_119)%>%
  mutate(age_month=102)%>%
  rename(child_staff_ratio=month_96_119)

# Create a panel dataset
for (m in 103:125) {
  temp <- staff_ratio %>%
    select(state_str,month_96_119)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_96_119)
  
  staff_ratio_102_125 =rbind(staff_ratio_102_125 ,temp)
}


####################
#Age 120+ (10+ years)
####################


staff_ratio_120 <- staff_ratio %>%
  select(state_str,month_120)%>%
  mutate(age_month=120)%>%
  rename(child_staff_ratio=month_120)

# Create a panel dataset
for (m in 121:160) {
  temp <- staff_ratio %>%
    select(state_str,month_120)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_120)
  staff_ratio_120=rbind(staff_ratio_120,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_126 <- staff_ratio %>%
  select(state_str,month_120)%>%
  mutate(age_month=126)%>%
  rename(child_staff_ratio=month_120)

# Create a panel dataset
for (m in 127:160) {
  temp <- staff_ratio %>%
    select(state_str,month_120)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_120)
  staff_ratio_126=rbind(staff_ratio_126,temp)
}


panel_staff_ratio_6m <- rbind(staff_ratio_0_14, staff_ratio_15_23, staff_ratio_24_32, staff_ratio_33_41, staff_ratio_42_53, 
                              staff_ratio_54_65, staff_ratio_66_77, staff_ratio_78_89,staff_ratio_90_101 , 
                              staff_ratio_102_125,staff_ratio_126)%>%
                              #generate staff-to-child from child-to-staff ratio with 6 months lag
                              mutate(staff_ratio_6m=1/child_staff_ratio)%>%
                              select(-child_staff_ratio)  

panel_staff_ratio_2002 <- rbind(staff_ratio_0_8, staff_ratio_9_17, staff_ratio_18_26, staff_ratio_27_35, staff_ratio_36_47, 
                                staff_ratio_48_59, staff_ratio_60_71, staff_ratio_72_83,staff_ratio_84_95 , 
                                staff_ratio_96_119,staff_ratio_120)%>%
                                #generate staff-to-child from child-to-staff ratio
                                mutate(staff_ratio=1/child_staff_ratio)%>%
                                select(-child_staff_ratio) %>%
                                merge(panel_staff_ratio_6m)%>%
                                mutate(year=2002)

#Use 2002 ratios for 1997 interview
panel_staff_ratio_1997 <- panel_staff_ratio_2002%>%
mutate(year=1997)


#Delete all objects that start with staff_ratio to generate a 2007 panel

# List all objects in the current environment
objects <- ls()

# Identify objects starting with "staff_ratio"
objects_to_delete <- objects[grep("^staff_ratio", objects)]

# Delete objects starting with "staff_ratio"
rm(list = objects_to_delete)

###############################################################
##########################2007 ratios##########################
###############################################################

#Create a panel of mandatory staff-child ratio by state/age of child in months

staff_ratio <- readxl::read_excel("../../../data/data_PSID/cds/staff-child-ratios/Ratios_Center_Based_2007.xlsx")


####################
#Age 0-8 months
####################
staff_ratio_0_8 <-  staff_ratio %>%
  select(state_str,month_0_8)%>%
  mutate(age_month=0)%>%
  rename(child_staff_ratio=month_0_8)

# Create a panel dataset
for (m in 1:8) {
  temp <-  staff_ratio %>%
    select(state_str,month_0_8)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_0_8)
  
  staff_ratio_0_8= rbind( staff_ratio_0_8,temp)
}

#Second version with cutoffs 6 months later
staff_ratio_0_14 <-  staff_ratio %>%
  select(state_str,month_0_8)%>%
  mutate(age_month=0)%>%
  rename(child_staff_ratio=month_0_8)

# Create a panel dataset
for (m in 1:14) {
  temp <-  staff_ratio %>%
    select(state_str,month_0_8)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_0_8)
  staff_ratio_0_14= rbind(staff_ratio_0_14,temp)
}




####################
#Age 9-17 months
####################
staff_ratio_9_17 <-  staff_ratio %>%
  select(state_str,month_9_17)%>%
  mutate(age_month=9)%>%
  rename(child_staff_ratio=month_9_17)

# Create a panel dataset
for (m in 10:17) {
  temp <-  staff_ratio %>%
    select(state_str,month_9_17)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_9_17)
  staff_ratio_9_17=rbind(staff_ratio_9_17,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_15_23 <-  staff_ratio %>%
  select(state_str,month_9_17)%>%
  mutate(age_month=15)%>%
  rename(child_staff_ratio=month_9_17)

# Create a panel dataset
for (m in 16:23) {
  temp <-  staff_ratio %>%
    select(state_str,month_9_17)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_9_17)
  staff_ratio_15_23=rbind(staff_ratio_15_23,temp)
}
####################
#Age 18-26
####################

staff_ratio_18_26 <-  staff_ratio %>%
  select(state_str,month_18_26)%>%
  mutate(age_month=18)%>%
  rename(child_staff_ratio=month_18_26)

# Create a panel dataset
for (m in 19:26) {
  temp <-  staff_ratio %>%
    select(state_str,month_18_26)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_18_26)
  staff_ratio_18_26=rbind(staff_ratio_18_26,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_24_32 <-  staff_ratio %>%
  select(state_str,month_18_26)%>%
  mutate(age_month=24)%>%
  rename(child_staff_ratio=month_18_26)

# Create a panel dataset
for (m in 25:32) {
  temp <-  staff_ratio %>%
    select(state_str,month_18_26)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_18_26)
  staff_ratio_24_32=rbind(staff_ratio_24_32,temp)
}

####################
#Age 27-35
####################


staff_ratio_27_35 <-  staff_ratio %>%
  select(state_str,month_27_35)%>%
  mutate(age_month=27)%>%
  rename(child_staff_ratio=month_27_35)

# Create a panel dataset
for (m in 28:35) {
  temp <-  staff_ratio %>%
    select(state_str,month_27_35)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_27_35)
  staff_ratio_27_35=rbind(staff_ratio_27_35,temp)
}


#Second version with cutoffs 6 months later

staff_ratio_33_41 <-  staff_ratio %>%
  select(state_str,month_27_35)%>%
  mutate(age_month=33)%>%
  rename(child_staff_ratio=month_27_35)

# Create a panel dataset
for (m in 34:41) {
  temp <-  staff_ratio %>%
    select(state_str,month_27_35)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_27_35)
  
  staff_ratio_33_41=rbind(staff_ratio_33_41,temp)
}

####################
#Age 36-47 (3 years)
####################


staff_ratio_36_47 <-  staff_ratio %>%
  select(state_str,month_36_47)%>%
  mutate(age_month=36)%>%
  rename(child_staff_ratio=month_36_47)

# Create a panel dataset
for (m in 37:47) {
  temp <-  staff_ratio %>%
    select(state_str,month_36_47)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_36_47)
  
  staff_ratio_36_47=rbind(staff_ratio_36_47,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_42_53 <-  staff_ratio %>%
  select(state_str,month_36_47)%>%
  mutate(age_month=42)%>%
  rename(child_staff_ratio=month_36_47)

# Create a panel dataset
for (m in 43:53) {
  temp <- staff_ratio %>%
    select(state_str,month_36_47)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_36_47)
  
  staff_ratio_42_53=rbind(staff_ratio_42_53,temp)
}

####################
#Age 48-59 (4 years)
####################

staff_ratio_48_59 <- staff_ratio %>%
  select(state_str,month_48_59)%>%
  mutate(age_month=48)%>%
  rename(child_staff_ratio=month_48_59)

# Create a panel dataset
for (m in 49:59) {
  temp <- staff_ratio %>%
    select(state_str,month_48_59)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_48_59)
  
  staff_ratio_48_59=rbind(staff_ratio_48_59,temp)
}


#Second version with cutoffs 6 months later


staff_ratio_54_65 <- staff_ratio %>%
  select(state_str,month_48_59)%>%
  mutate(age_month=54)%>%
  rename(child_staff_ratio=month_48_59)

# Create a panel dataset
for (m in 55:65) {
  temp <- staff_ratio %>%
    select(state_str,month_48_59)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_48_59)
  
  staff_ratio_54_65=rbind(staff_ratio_54_65,temp)
}

####################
#Age 60-119 (5-10 years)
####################

staff_ratio_60_119 <- staff_ratio %>%
  select(state_str,month_60_119)%>%
  mutate(age_month=60)%>%
  rename(child_staff_ratio=month_60_119)

# Create a panel dataset
for (m in 61:119) {
  temp <- staff_ratio %>%
    select(state_str,month_60_119)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_60_119)
  
  staff_ratio_60_119=rbind(staff_ratio_60_119,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_66_125 <- staff_ratio %>%
  select(state_str,month_60_119)%>%
  mutate(age_month=66)%>%
  rename(child_staff_ratio=month_60_119)

# Create a panel dataset
for (m in 67:125) {
  temp <- staff_ratio %>%
    select(state_str,month_60_119)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_60_119)
  
  staff_ratio_66_125=rbind(staff_ratio_66_125,temp)
}


####################
#Age 120+ (10+ years)
####################


staff_ratio_120 <- staff_ratio %>%
  select(state_str,month_120)%>%
  mutate(age_month=120)%>%
  rename(child_staff_ratio=month_120)

# Create a panel dataset
for (m in 121:160) {
  temp <- staff_ratio %>%
    select(state_str,month_120)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_120)
  staff_ratio_120=rbind(staff_ratio_120,temp)
}

#Second version with cutoffs 6 months later

staff_ratio_126 <- staff_ratio %>%
  select(state_str,month_120)%>%
  mutate(age_month=126)%>%
  rename(child_staff_ratio=month_120)

# Create a panel dataset
for (m in 127:160) {
  temp <- staff_ratio %>%
    select(state_str,month_120)%>%
    mutate(age_month= m)%>%
    rename(child_staff_ratio=month_120)
  staff_ratio_126=rbind(staff_ratio_126,temp)
}





panel_staff_ratio_6m_2007 <- rbind(staff_ratio_0_14, staff_ratio_15_23, staff_ratio_24_32, staff_ratio_33_41, staff_ratio_42_53, 
                                   staff_ratio_54_65, staff_ratio_66_125,staff_ratio_126)%>%
                                    #generate staff-to-child from child-to-staff ratio with 6 months lag
                                    mutate(staff_ratio_6m=1/child_staff_ratio)%>%
                                    select(-child_staff_ratio)  

panel_staff_ratio_2007 <- rbind(staff_ratio_0_8, staff_ratio_9_17, staff_ratio_18_26, staff_ratio_27_35, staff_ratio_36_47, 
                                staff_ratio_48_59, staff_ratio_60_119,staff_ratio_120)%>%
                                #generate staff-to-child from child-to-staff ratio
                                mutate(staff_ratio=1/child_staff_ratio)%>%
                                select(-child_staff_ratio) %>%
                                merge(panel_staff_ratio_6m_2007)%>%
                                mutate(year=2007)


#Delete all objects that start with staff_ratio again

# List all objects in the current environment
objects <- ls()

# Identify objects starting with "staff_ratio"
objects_to_delete <- objects[grep("^staff_ratio", objects)]

# Delete objects starting with "staff_ratio"
rm(list = objects_to_delete)


#Merge all  panels with ratios together
staff_ratio_panel <- rbind(panel_staff_ratio_1997, panel_staff_ratio_2002, panel_staff_ratio_2007)

```

# Assembling a Panel of CDS Mothers

## Making a Panel with Fertility History

We start by loading the childbirth history file, which we will use to
identify mothers and arrange their fertility history into a panel
dataset. It will be useful to know the following mapping between
variable names and variables:

| Vnum   | Vname                    |
|:-------|:-------------------------|
| CAH3   | intnum68                 |
| CAH4   | pernum                   |
| CAH5   | sex                      |
| CAH6   | parent month of birth    |
| CAH7   | parent year of birth     |
| CAH8   | marital status when born |
| CAH9   | birth order              |
| CAH10  | child intnum             |
| CAH11  | child pernum             |
| CAH12  | sex of child             |
| CAH13  | child month of birth     |
| CAH15  | child year of birth      |
| CAH108 | number of birth records  |

The chunk below loads these data and applies some sample restrictions: -
Drop women without recorded births - Drop women with missing birth year
information - Drop women with missing information in their childbirth
history - Keep only women whose children appear in the CDS

First, we use one of the cleaned CDS files to get a file with ID
numbners of all CDS children.

```{r}
# get an index of CDS children
KID <- assessment_panel %>%
  select(KID) %>%
  unique()
```

Next, we load the childbirth record for women in the PSID and apply some sample restrictions.

```{r}
child_record <- read.csv("../../../data/data_PSID/main/childbirth/Childbirth.csv") %>%
  mutate(X = NULL) %>% #<- load childbirth file
  filter(CAH5 == 2) %>% # keep only women
  filter(CAH10 != 0) %>% # drop women who have no children according to this file
  filter(CAH7 != 9998) %>% # drop women with missing birth year info
  mutate(MID = CAH3 * 1000 + CAH4, KID = CAH10 * 1000 + CAH11) %>%
  group_by(MID) %>%
  filter(!any(CAH10 == 9999))
```

We use this childbirth record to first create a data frame with the IDs of all CDS mothers.

Then, we use that file to produce a full childbirth history file with
just CDS mothers.

```{r}

mother_index <- child_record %>%
  select(MID, KID) %>%
  inner_join(KID) %>%
  select(MID) %>%
  unique()

C <- child_record %>% #<- this matches the sample from before. All good.
  inner_join(mother_index)
```

We want a panel dataset that begins when each mother turns 18 and ends
in 2017. Each individual-year observation has the number of dependent
children, the age of the youngest child, the number of children between
0 and 5, and the number of children between 6 and 12.

The function below does this assuming that some dataframe `d` contains
the childbirth data for just one mother:

```{r}
MakePanel <- function(d) {
  year <- (d$CAH7[1] + 16):2017
  n_y <- length(year)
  knum <- d$CAH108[1]
  age_youngest <- integer(n_y) - 1
  age_oldest <- integer(n_y) + Inf
  num_0_5 <- integer(n_y)
  num_6_12 <- integer(n_y)
  num_child <- integer(n_y)
  m_age <- year - d$CAH7[1]
  for (i in 1:length(year)) {
    ay <- Inf
    ao <- -1
    a5 <- 0
    a6 <- 0
    for (k in 1:nrow(d)) {
      agek <- year[i] - d$CAH15[k]
      if (agek >= 0 && agek <= 18) {
        ay <- min(ay, agek)
        ao <- max(ao, agek)
        num_child[i] <- num_child[i] + 1
      }
      if (agek >= 0 && agek <= 5) {
        a5 <- a5 + 1
      }
      if (agek > 5 && agek <= 12) {
        a6 <- a6 + 1
      }
    }
    age_youngest[i] <- ay
    age_oldest[i] <- ao
    num_0_5[i] <- a5
    num_6_12[i] <- a6
  }
  data.frame(MID = d$MID[1], m_age = m_age, year = year, knum = knum, num_child = num_child, age_youngest = age_youngest, age_oldest = age_oldest, num_0_5 = num_0_5, num_6_12 = num_6_12, y_first = d$CAH15[1])
}
```

The chunk below applies the function for each mother (defined by their
ID number `MID`) which results in a panel dataset. In order to speed up
analysis, we also drop observations that we know will be too early for
the CDS (everyone with first births before 1960).

```{r}

mother_panel <- C %>%
  group_by(MID) %>%
  do(MakePanel(.)) %>%
  filter(y_first >= 1960) %>%
  left_join(passage_comprehension) # Add mother's cognitive score

```

## Merge in State Identifiers and Childcare Expenditures

We start by merging this panel with the Individual Cross-Year File,
which maps an (ID,year) combination with an interview number and
sequence number (which allows us to link with state codes and household
child care expenditures)

This chunk loads the individual file and creates the ID number:

```{r}
Ind <- identifiers_panel %>%
  mutate(MID = intnum68 * 1000 + pernum, mar_stat = mpair > 0)
```

Since we would like to see marital status in some non-interview years,
we use the following year's marital status:

```{r}
Ind <- Ind %>%
  filter(year >= 1999) %>%
  mutate(year = year - 1) %>%
  rbind(Ind)
```

This method of imputation ensures that we only fill in missing data from non-interview years.

One thing that will be useful throughout is a panel with the mother's ID linked to their interview and calendar year. Some variables from the main interview refer to the previous year and things can get messy, so we use this file to keep things consistent.

```{r}
# this will be useful throughout: create a dataframe that maps mothers to their interview number and sequence number
D_Ind <- mother_panel %>%
  inner_join(Ind)
```

Next we merge the panel of state codes with the cross-year individual file. We then merge that file with a state code cross-walk so that we can link the PSID's state code with FIPS and SOI. We also merge, based on the household interview number and year, annual household expenditures on child care (cleaned in a previous section).

Finally, this file is merged with the panel of mothers we just created, using years before and after to fill in missing state observations.

```{r}
main_childcare <- main_childcare %>%
  inner_join(D_Ind) %>% #<- merge to mothers through the panel file
  mutate(year = year - 1) %>% #<- convert survey year to lag year
  select(MID, year, childcare_exp)
```
```{r}
mother_panel <- main_state %>%
  right_join(Ind) %>% #<- merge with Individual file
  right_join(mother_panel) %>% #<- merge with the panel we created (keeping all years)
  arrange(MID, year) %>%
  group_by(MID) %>%
  fill(state, .direction = "downup") %>% #<- fill in missing observations
  merge(state_codes) %>% #<- merge in the cross-walk with FIPS codes and SOI codes
  left_join(main_childcare) #<= use only the main file to link
```

## Identify and Merge in Spouses ("Fathers")

The next step is to identify spouses for each mother in each year.
Spouses are linked by the `mpair` variable which is unique within a
household interview. We identify partners by: 

1. Merging the individual file with the triple (`MID`,`year`,`m_age`) from the panel, keeping all observations from the individual file. 
2. Partners are identified by the fact that they are not successfully matched in this merge, meaning that `m_age` will be missing. We keep those observations and rename the variables in order to merge them once more with the panel of mothers.

```{r}
mother_panel <- mother_panel %>%
  select(MID, year, m_age) %>%
  right_join(Ind) %>% #<- merge with individual file, keep all observations from Ind
  filter(is.na(m_age)) %>% #<- identify spouses by missing info from the merge
  rename(FID = MID, snF = sn) %>% #<- rename variables for merge
  filter(mpair > 0) %>% #<- only keep individuals who are not mothers but who are in a relationship in the same household
  select(FID, year, intnum, mpair, snF) %>%
  right_join(mother_panel) %>% #<- do the merge
  mutate(snF = replace_na(snF, -1)) #<- replace missing values.
```

Create distance to parents files in 1988 using ID's.

```{r}
#Load miles to parents in 1988

d <- parent_distance %>%
  rename(intnum88=intnum)

#Match miles to parents in 1988 by family intnum for mothers and fathers
id <- Ind  %>% 
  filter(year==1988) %>%
  rename(id = MID)%>%
  select(id, sn, intnum)%>%
  rename(intnum88=intnum)

# mothers: use mid from the main sample, assign distance based on family intnum and sn
mother_distance <- mother_panel %>%
  select(MID) %>%
  unique() %>%
  merge(id,by.x="MID",by.y="id") %>%
  merge(d) %>%
  mutate(m_parents_hundred = case_when(sn==1 ~ head_hundred_miles,sn==2 ~ sp_hundred_miles),
         m_parents_ten = case_when(sn==1 ~ head_ten_miles,sn==2 ~ sp_ten_miles)) %>%
  select(MID,m_parents_hundred,m_parents_ten)


# fathers: use fid from the main sample, assign distance based on family intnum and sn
father_distance <- mother_panel %>%
  select(FID) %>%
  drop_na() %>%
  unique() %>%
  merge(id,by.x="FID",by.y="id") %>%
  merge(d) %>%
  mutate(f_parents_hundred = case_when(sn==1 ~ head_hundred_miles,sn==2 ~ sp_hundred_miles),
         f_parents_ten = case_when(sn==1 ~ head_ten_miles,sn==2 ~ sp_ten_miles)) %>%
  select(FID,f_parents_hundred,f_parents_ten)

```

## Merge in Spouse Birth Year and Marriage Summary Variables

The *Marriage History File* is prepared by the PSID and contains a
record of each marriage in the PSID. We use this file to determine which
individuals have ever been married, to filter out individuals with
missing marriage information, and to identify the birth year of spouses.

First we load the marriage file and create an "ever" married summary
variable. We finish by merging this file with the main panel (based on
MID)

```{r}
# #  ---- load marriage info and create some summary variables
M <- read.csv("../../../data/data_PSID/main/marriage/Marriage.csv") %>%
  mutate(X = NULL) %>%
  rename(MID = ID1) %>% #<- we rename to MID in preparation for a first merge with sample mothers
  rename(ybirth = MH6) %>%
  filter(ybirth < 9998) %>% #<- drop individuals with missing birth info
  rename(ymar = MH11) %>%
  filter(ymar != 9998) %>% #<- remove individuals without marriage info
  mutate(record = ymar < 9999) %>% #<- ymar==9999 indicates never married
  group_by(MID) %>%
  summarise(ever_married = sum(record) > 0, ybirth = ybirth[1])

mother_panel <- M %>%
  inner_join(mother_panel)
```

We also use this file to merge with FID for each spouse to deduce the
spouse's age. This means we don't have the spouse's age for cohabiting
couples. We use the triple *year*, *intnum* and *mpair* to match women
with their spouses. To do this, we first create an individual index file
of people that are *not* women in the sample. This is the potential pool
of fathers that we use for the merge.

```{r}

mother_panel <- D_Ind %>%
  select(MID, year, m_age) %>%
  right_join(Ind) %>%
  filter(is.na(m_age)) %>% #<- pick out individuals from the merge who were *not* matched
  rename(FID = MID, snF = sn) %>%
  select(c("FID", "year", "intnum", "mpair", "snF")) %>%
  filter(mpair > 0) %>%
  right_join(mother_panel)

mother_panel <- M %>%
  rename(FID = MID, ybirthF = ybirth) %>%
  right_join(mother_panel) %>%
  mutate(f_age = year - ybirthF)

```

## Merge in Mother's Race, Education, and CPI (external)

Race and education are previously cleaned summary variables. We drop mothers from the sample with missing information on education and race. 

```{r}
# normalized to year 2002
cpi = read.csv("../../../data/data_PSID/CPI-U.csv") %>%
  rename(year = YEAR) %>%
  mutate(CPIU = CPIU/CPIU[56])
```

Then we merge them into the main file.

```{r}
mother_panel <- mother_panel %>% #<- drop if mother has missing race or education info
  inner_join(education, by = c("MID" = "ID")) %>%
  rename(m_ed = ed) %>%
  left_join(education, by = c("FID" = "ID")) %>%
  rename(f_ed = ed) %>%
  left_join(race, by = c("MID" = "ID")) %>%
  rename(m_race = Race) %>%
  left_join(race, by = c("FID" = "ID")) %>%
  rename(f_race = Race)%>%
  left_join(cpi)
```

## Clean and Prepare Labor Market Data

Labor market earnings and hours come from the main interview of the
PSID, which collects information for the "husband", the "wife" and other family members. Individuals can be matched to these variables via their sequence number in the household (which we will do below). 

We first merge with the main panel and assign wages, earnings, and
hours, based on the mother's and father's sequence number, which tells
us if they are either the "husband" or "wife" in the household.

```{r}
earnings_panel <- mother_panel %>%
  select(MID, FID, year, intnum, sn, snF) %>%
  inner_join(earnings_panel) %>% #<- merge based on year and interview number
  mutate(m_hrs = case_when(sn == 1 ~ hours_head, sn == 2 ~ hours_spouse), m_earn = case_when(sn == 1 ~ earn_head, sn == 2 ~ earn_spouse)) %>%
  mutate(f_hrs = case_when(snF == 1 ~ hours_head, snF == 2 ~ hours_spouse), f_earn = case_when(snF == 1 ~ earn_head, snF == 2 ~ earn_spouse)) %>%
  mutate(m_wage = m_earn / m_hrs, f_wage = f_earn / f_hrs) %>%
  select(MID, year, m_earn, m_hrs, m_wage, f_earn, f_hrs, f_wage) %>% #<- we don't keep FID because the coupling MID,FID is dynamic and may not match the pair (MID,FID) from the previous year when we merge these data back in
  mutate(year = year - 1) #<- these variable refer to the previous year so we subtract one
```

Then we do the same with the T2 supplement data.

```{r}
# we link first to mothers:
T2m <- mother_panel %>%
  select(MID) %>%
  unique() %>%
  inner_join(earnings_panel_supplement) %>% #<- first four lines pick out mothers in CDS sample
  rename(m_earn = earn, m_hrs = hrs) %>%
  mutate(m_wage = m_earn / m_hrs) %>%
  select(MID, year, m_earn, m_wage, m_hrs) #<- these are mothers so drop the FID variable

# and then to fathers
T2f <- mother_panel %>%
  select(FID) %>%
  unique() %>%
  inner_join(earnings_panel_supplement) %>% #<-first four lines pick out "fathers" in sample (recall we are labelling all spouses as fathers)
  rename(f_earn = earn, f_hrs = hrs) %>%
  mutate(f_wage = f_earn / f_hrs) %>%
  select(FID, year, f_earn, f_wage, f_hrs) #<- these are fathers so drop the MID variable

# now we link the pair (MID,FID,year) first to mothers (MID,year) and then to fathers (MID,year), *then* we append all of this to the data from the main interview
```

Join the data on total family income and taxable income of head ans spouse by family interview number and year.
```{r}

mother_panel <- mother_panel %>%
  left_join(total_income)
```


## Merge in Labor Market Earnings, Hours, Wages

We finish by (1) linking each of these files to mothers and fathers in
our panel sample; (2) Adding this to the data from the main interview;
and (3) Merging with the panel dataset.

```{r}
mother_panel <- mother_panel %>%
  select(MID, FID, year) %>%
  inner_join(T2m) %>%
  left_join(T2f) %>%
  select(-FID) %>%
  rbind(earnings_panel) %>% #<- add all of this to the original data frame with the interview year questions
  mutate(house_earn = case_when(!is.na(f_earn) ~ f_earn + m_earn, TRUE ~ m_earn), m_wage = na_if(na_if(m_wage, Inf), 0), f_wage = na_if(na_if(f_wage, Inf), 0)) %>%
  right_join(mother_panel) # %>%

#load occupation data ad assign occupation to wife and husband by interview number and sequence number
occupations <- read.csv("../../../data/data_PSID/main/occupation/output/occ.csv")

mother_panel <- mother_panel %>%
 left_join(occupations)%>%
 mutate(m_occ_major = case_when(sn == 1 ~ occ_major_h, sn == 2 ~ occ_major_s),
        m_occ_minor = case_when(sn == 1 ~ occ_minor_h, sn == 2 ~ occ_minor_s),
        f_occ_major = case_when(snF == 1 ~ occ_major_h, snF == 2 ~ occ_major_s),
        f_occ_minor = case_when(snF == 1 ~ occ_minor_h, snF == 2 ~ occ_minor_s))

```

## Sample restrictions and creating final variables

Create some missing variables.
```{r}
mother_panel <- mother_panel %>% #set to missing age of children if no children
      mutate(
      age_oldest = replace(age_oldest, which(num_child==0), NA_real_),
      age_youngest = replace(age_youngest, which(num_child==0), NA_real_),
#set to missing earnings if negative
      m_earn = replace(m_earn, which(m_earn<0), NA_real_),
      f_earn = replace(f_earn, which(f_earn<0), NA_real_),
      m_wage = replace(m_wage, which(m_wage<0), NA_real_),
      f_wage = replace(f_wage, which(f_wage<0),NA_real_),
      house_earn = replace(house_earn, which(house_earn<0), NA_real_)
    )
```

Define some new variables for later convenience in estimation.

```{r}
mother_panel <- mother_panel %>% 
    mutate(
    # generate 0/1 indicator for current marrital status
    curr_married=case_when(
      mar_stat == "TRUE" ~ 1, 
      mar_stat == "FALSE" ~ 0),
    #generate race indicators for mothers
    m_white = ifelse(m_race==1,1,0), 
    m_black=ifelse(m_race==2,1,0), 
    m_r_oth=ifelse(m_race>2,1,0),
    #generate race indicators for fathers
    f_white = ifelse(f_race==1,1,0), 
    f_black=ifelse(f_race==2,1,0), 
    f_r_oth=ifelse(f_race>2,1,0),
    #generate education indicators for fathers
    fed_hsd = ifelse(f_ed== "<12",1,0),
    fed_hs = ifelse(f_ed== "12",1,0),
    fed_scoll = ifelse(f_ed== "13-15",1,0),
    fed_coll = ifelse(f_ed== "16",1,0),
    fed_postcol = ifelse(f_ed== ">16",1,0),
    fed_collplus = fed_coll+fed_postcol,
    fed_scollplus =fed_scoll+ fed_coll+fed_postcol,
    #generate education indicators for mothers
    med_hsd = ifelse(m_ed== "<12",1,0),
    med_hs = ifelse(m_ed== "12",1,0),
    med_scoll = ifelse(m_ed== "13-15",1,0),
    med_coll = ifelse(m_ed== "16",1,0),
    med_postcol  = ifelse(m_ed== ">16",1,0),
    med_collplus = med_coll+med_postcol,
    med_scollplus = med_scoll+med_coll+med_postcol,
    #generate a variable measuring number of children younger than 12
    num_0_12=num_0_5+num_6_12,
    #mom age at oldest child's birth
    momageatbirth1 = m_age - age_oldest,
    # create measures of potential work experience
    m_exper = case_when( med_scoll == 1~m_age-20, med_coll == 1 ~ m_age-22, med_postcol == 1 ~ m_age-24, TRUE ~m_age-18),
    f_exper = case_when ( fed_scoll == 1~ f_age-20, fed_coll == 1 ~ f_age-22, fed_postcol == 1 ~ f_age-24, TRUE ~f_age-18),
    m_exper2 = m_exper*m_exper,
    f_exper2 = f_exper*f_exper)

#Generate non-labour income measure for married and single parents
mother_panel <- mother_panel %>%
  mutate(non_lab_income= case_when(
    curr_married == 1 ~ total_income-m_earn-f_earn,
    curr_married == 0 ~ total_income-m_earn)
    ) %>%
  mutate(non_lab_income=non_lab_income/52) #annual to weekly

```

Define a variable indicating whether in sample.

```{r}
mother_panel <- mother_panel %>%
  #mom is between 16 and 45 at birth of the oldest child (or missing)%>%
  mutate(
    ind_not_sample=case_when(
      momageatbirth1>=16 & momageatbirth1<=45 ~ 0,
      is.na(momageatbirth1)==1~0, 
      TRUE~1)) %>%
    #mom and dad are between 18 and 65 or age missing
  mutate(
    ind_not_sample=case_when(
      m_age>=18&m_age<=65 ~ ind_not_sample,
      is.na(m_age)==1~ind_not_sample, 
      TRUE~1)
      ) %>%
  mutate(
    ind_not_sample=case_when(
      f_age>=18&f_age<=65 ~ ind_not_sample,
      is.na(f_age)==1 ~ ind_not_sample, TRUE~1)
      )
```

Censor wages based on quantiles of the sample.
```{r}
#Calculate quantiles for observations that will remain in the sample (ind_not_sample==0) by year\broad education group

q_m <- mother_panel %>%
  filter(ind_not_sample==0)%>%
  group_by(year, med_scollplus) %>%
  summarize(
    qm1 = quantile(m_wage, 0.01, na.rm = TRUE),
    qm99 = quantile(m_wage, probs = 0.99, na.rm = TRUE))

q_f <- mother_panel %>%
  filter(ind_not_sample==0)%>%
  group_by(year, fed_scollplus) %>%
  summarize(
    qf1 = quantile(f_wage, 0.01, na.rm = TRUE),
    qf99 = quantile(f_wage, probs = 0.99, na.rm = TRUE))

mother_panel <- mother_panel %>%
  left_join(q_m) %>%
  left_join(q_f) %>%
  mutate(m_wage=case_when(m_wage >= qm1 & m_wage <= qm99 ~ m_wage, TRUE~ NA_real_)) %>%
  mutate(f_wage=  case_when(f_wage >= qf1 & f_wage <= qf99 ~ f_wage, TRUE~ NA_real_)) %>%
  select(-qf1,-qf99,-qm1,-qm99)

#Deflate wages, annual earning,  non-labour income to 2002 dollars 
mother_panel <- mother_panel %>%
  mutate(m_wage=m_wage/CPIU,
          f_wage=f_wage/CPIU,
          m_earn=m_earn/CPIU,
          f_earn=f_earn/CPIU,
          total_income=total_income/CPIU,
          tax_income=tax_income/CPIU,
          non_lab_income=non_lab_income/CPIU)


# Calculate log wage measures for trimmed wages
mother_panel <- mother_panel %>% 
  mutate(
        ln_wage_m=log(m_wage),
        ln_wage_f=log(f_wage),
        ln_wage_fm_ratio = ln_wage_f - ln_wage_m)
```


Finally, save the panel dataset.

```{r}
write.csv(mother_panel, "../../../data/data_derived/MotherPanelCDS.csv")
write.dta(mother_panel, "../../../data/data_derived/MotherPanelCDS.dta")
```

# Assembling a Panel of CDS Children

```{r}
child_panel <- KID %>%
  inner_join(C) %>%
  group_by(KID) %>%
  filter(sum(CAH2 == 2) == 0) %>% # drop children with adoption records
  rename(ybirth_child = CAH15) %>%
  mutate(ybirth_child = na_if(ybirth_child, 9998)) %>%
  select(MID, KID, ybirth_child) %>%
  merge(mother_panel) %>%
  mutate(age = year - ybirth_child) %>%
  left_join(assessment_panel) %>%
  left_join(cds_expenditures) %>%
  left_join(cds_childcare) %>%
  left_join(TD_agg) %>%
  left_join(PPE) %>%
  left_join(age_month)%>%
  left_join(staff_ratio_panel)%>%#staff to child ratios by age of child in months
  left_join(relative_present)%>%
  filter(year>=1997,year<=2007)



#Add mobility data and 1988 distance to parents
child_panel <- child_panel %>%
  left_join(head_grow_up) %>%
  left_join(mother_distance) %>%
  left_join(father_distance) 


child_panel <- child_panel %>% #generate HH weekly childcare expenditure per child
  mutate(chcare_hh=childcare_exp/52,
         chcare_hh_pc=chcare_hh/num_0_12,
         #generate total goods expenditures
         hhinvest = SchSupplies + sports + lessons + comm_grps + tutoring + Toys)

child_panel <- child_panel %>%
  #generate age at birth of the child
  mutate(momageatbirth=m_age-age)



# Sample restriction: only households with children age 0-18
child_panel <- child_panel %>%
  mutate(ind_not_sample=case_when((num_child>0 & age>=0 & age<=18) ~ ind_not_sample, TRUE~1))

#Deflate childcare and goods expenditures to 2002 dollars 
 child_panel <- child_panel %>%
    mutate(chcare=chcare/CPIU,
           chcare_second=chcare_second/CPIU,
           chcare_hh_pc=chcare_hh_pc/CPIU,
           chcare_hh =chcare_hh /CPIU,
           hhinvest=hhinvest/CPIU)
```

```{r}
write.csv(child_panel,"../../../data/data_derived/ChildPanelCDS.csv")
write.dta(child_panel, "../../../data/data_derived/ChildPanelCDS.dta")

```